first-data-crawler-pdf/utils.py

#!/usr/bin/env python3
# pylint: disable=missing-docstring,invalid-name

import random
import re
from collections import namedtuple
from datetime import date as DTDate
from datetime import timedelta as DTTimeDelta
from datetime import datetime as DTDateTime
from typing import BinaryIO, List

import mechanize as m
import PyPDF3
from dateutil.relativedelta import FR, relativedelta

# One parsed rate row: currency abbreviation, full currency name, and the ask
# and bid values (each a float, or None when _parse_rate could not parse it).
Rate = namedtuple('Rate', ['abbr', 'full_name', 'ask', 'bid'])

# Constants
# Card-type selectors. Each one is the value list that get_fileio() assigns to
# the 'creditCardsRadio' form control; mk_filename() compares against them.
CARD_MASTERCARD = ['0']
CARD_VISA = ['1']

class CurrencyResult():
    """Container for everything parsed out of one rates document."""

    def __init__(self):
        # Starts out as an empty list; get_results_from_text() replaces it
        # with a dict that maps currency abbreviation -> Rate.
        self.rates = []
        # Card type as printed in the document header (empty until parsed).
        self.card_type = ''
        # Date printed in the document header; None until parsed.
        self.date = None

def _parse_rate(text: str) -> float or None:
    """Convert one rate column into a float.

    Returns None when the column starts with the "Keine Kursdaten vorhanden"
    placeholder (German for "no rate data available") or when the remaining
    text is not a valid number.
    """
    if re.match('Keine Kursdaten vorhanden', text):
        return None
    # The document uses a decimal comma; float() needs a decimal point.
    normalized = text.strip(' ').replace(',', '.')
    try:
        return float(normalized)
    except ValueError:
        return None

def _parse_card_type(text: str) -> str:
    """Extract the card type from a 'label: value' header line.

    Used for validating metadata from the PDF against the request data.
    Only the piece between the first and second colon is kept; surrounding
    double quotes and spaces are stripped. Raises IndexError when the line
    contains no colon.
    """
    value = text.split(':')[1]
    return value.strip('" ')

def _parse_date(text: str) -> DTDate:
    """Extract the date from a 'label: DD.MM.YYYY' header line.

    Used for validating metadata from the PDF against the request data.
    Raises IndexError when the ': ' separator is missing and ValueError when
    the value is not a DD.MM.YYYY date.
    """
    date_part = text.split(': ')[1].rstrip()
    parsed = DTDateTime.strptime(date_part, '%d.%m.%Y')
    return parsed.date()

def _array_remove_empty(obj: list) -> List[str]:
    """Strip every empty-string element out of *obj*.

    The list is modified in place and the same list object is returned.
    """
    while '' in obj:
        obj.remove('')
    return obj

def _parse_line(line: str) -> Rate or None:
    """Turn one rate line of the extracted PDF text into a Rate.

    Columns are expected in the order 'ABBR (Full name)', ask, bid.
    Raises IndexError when the line has fewer than three columns or when the
    first column contains no space between abbreviation and name.
    """
    # 3 spaces = minimum separation in PDF; wider gaps yield empty pieces,
    # which are dropped.
    columns = _array_remove_empty(line.split("   "))
    # process currency name: abbreviation first, then the name in parentheses
    name_parts = columns[0].split(" ", 1)
    abbreviation = name_parts[0]
    full_name = name_parts[1].strip("()")
    ask = _parse_rate(columns[1])
    bid = _parse_rate(columns[2])
    return Rate(abbr=abbreviation, full_name=full_name, ask=ask, bid=bid)


def get_results_from_text(text: str, currency: str = None) -> CurrencyResult:
    """Parse the extracted PDF text into a CurrencyResult.

    Expected line layout: two intro lines, the card-type line, the date
    line, four further lines that are skipped, then one rate per line.

    When *currency* is given, only lines matching it at the start are parsed.
    It is inserted into a regular expression unescaped, so regex
    metacharacters in it are interpreted as such.

    The returned result's ``rates`` is a dict keyed by currency abbreviation.
    """
    lines = text.splitlines()
    result = CurrencyResult()
    # card type (third line) and date (fourth line)
    result.card_type = _parse_card_type(lines[2])
    result.date = _parse_date(lines[3])

    matcher = None if currency is None else re.compile("^"+currency)
    rates = {}
    # now the rates begin (after four more skipped lines)
    for line in lines[8:]:
        if matcher is not None and not matcher.match(line):
            continue
        parsed = _parse_line(line)
        rates[parsed.abbr] = parsed
    result.rates = rates
    return result

def get_results_from_pdf(buf: BinaryIO or str, currency: str = None) -> CurrencyResult:
    """Extract the text of a rates PDF and parse it.

    *buf* is handed straight to PyPDF3.PdfFileReader; *currency* is forwarded
    to get_results_from_text(). Progress messages are printed to stdout.
    """
    print('Parsing data... ', end='')
    reader = PyPDF3.PdfFileReader(buf)
    # NOTE(review): the upper bound is getNumPages() - 1, so the final page is
    # never read (and a single-page PDF yields no text at all). Confirm that
    # skipping the last page is intended before changing this.
    page_count = reader.getNumPages() - 1
    pages = [reader.getPage(index) for index in range(page_count)]
    text = ''.join(page.extractText() for page in pages)
    print('Done.')
    return get_results_from_text(text, currency=currency)
def get_fileio(date: DTDate, card_type: List[str] = CARD_VISA) -> BinaryIO: # pylint: disable=dangerous-default-value
    """Download the rates PDF for *date* and return it as an open binary file.

    *card_type* is one of the CARD_* constants. The default is the shared
    CARD_VISA list; this function only assigns it to the form control.
    Progress messages are printed to stdout. The caller is responsible for
    closing the returned file object.
    """
    # pylint: disable=no-member # mechanize.Browser has some lazy-loading methods that pylint doesn't see
    print('Downloading rates for ' + date.strftime('%Y-%m-%d') + '... ', end='')
    browser = m.Browser()
    # Optional switches, kept here for troubleshooting:
    #   Firefox 64 User-Agent
    #     ua = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'
    #     browser.set_header('User-Agent', ua)
    #   Ignore robots.txt
    #     browser.set_handle_robots(False)
    #   Debugging flags
    #     browser.set_debug_http(True)
    #     browser.set_debug_redirects(True)
    #     browser.set_debug_responses(True)
    # PDF URL
    browser.open('https://misc.firstdata.eu/CurrencyCalculator/fremdwaehrungskurse/pdf')
    form = browser.forms()[0]
    # The controls must be made writable first, otherwise the options
    # cannot be changed.
    form.set_all_readonly(False)
    # Configure form
    form['creditCardsRadio'] = card_type
    form['selectedDatesString'] = str(date.strftime('%Y%m%d') + ',')
    # Retrieve file using button click; the button is 115x21 pixels in size.
    # The API apparently doesn't like the max values, so stay inside them.
    click_x = random.randint(1, 114)
    click_y = random.randint(1, 20)
    request = form.click(name='submitButton', coord=(click_x, click_y))
    request.add_header('Accept', '*/*')
    downloaded = browser.retrieve(request)
    print(' Done.')
    # The first element of the retrieve() result is the path of the
    # downloaded file; hand it back as an open file-like object.
    return open(downloaded[0], 'rb')
def get_date(today: DTDate = None) -> DTDate:
    """Return the most recent date for which rate data should exist.

    *today* is the reference date and defaults to the current local date.
    It is read exactly once, so the weekday check and the date arithmetic
    always refer to the same day (the previous version called
    ``DTDate.today()`` twice and could straddle midnight).

    For Sunday and Monday the preceding Friday is returned, because
    Saturday and Sunday are completely null; for every other day the
    previous day is returned.
    """
    if today is None:
        today = DTDate.today()
    # weekday(): Monday == 0 ... Sunday == 6
    if today.weekday() in (6, 0):
        # FR(-1) snaps to the last Friday on or before the reference date.
        return today + relativedelta(weekday=FR(-1))
    # For all other days, the previous day is fine
    return today - DTTimeDelta(days=1)

def mk_filename(date: DTDate, card_type: List[str]) -> str:
    """Build the PDF file name '<ISO date>_<MC|VISA>.pdf' for a download.

    *card_type* must equal CARD_MASTERCARD or CARD_VISA (List[str] is used
    to avoid introducing a class just for this); anything else raises
    TypeError.
    """
    if card_type == CARD_MASTERCARD:
        suffix = '_MC.pdf'
    elif card_type == CARD_VISA:
        suffix = '_VISA.pdf'
    else:
        raise TypeError("not a valid card type")
    return date.isoformat() + suffix
Initial commit 2019-03-27 17:00:04 +01:00			`#!/usr/bin/env python3`
			`# pylint: disable=missing-docstring,invalid-name`

i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`import random`
Initial commit 2019-03-27 17:00:04 +01:00			`import re`
i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`from collections import namedtuple`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`from datetime import date as DTDate`
			`from datetime import timedelta as DTTimeDelta`
i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`from datetime import datetime as DTDateTime`
			`from typing import BinaryIO, List`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00
			`import mechanize as m`
i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`import PyPDF3`
			`from dateutil.relativedelta import FR, relativedelta`
Initial commit 2019-03-27 17:00:04 +01:00
			`Rate = namedtuple('Rate', ['abbr', 'full_name', 'ask', 'bid'])`

[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`# Constants`
			`CARD_MASTERCARD = ['0']`
			`CARD_VISA = ['1']`

Initial commit 2019-03-27 17:00:04 +01:00			`class CurrencyResult():`
			`def __init__(self):`
			`self.rates = list()`
			`self.card_type = str()`
			`self.date = None`

			`def _parse_rate(text: str) -> float or None:`
			`if re.match('Keine Kursdaten vorhanden', text):`
			`_r = None`
			`else:`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# strip whitespace and format decimal numbers correctly for parsing`
Initial commit 2019-03-27 17:00:04 +01:00			`text = text.strip(' ').replace(',', '.')`
			`try:`
			`_r = float(text)`
			`except ValueError:`
			`_r = None`
			`return _r`

			`def _parse_card_type(text: str) -> str:`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# Method for validating metadata from the PDF against the request data`
Initial commit 2019-03-27 17:00:04 +01:00			`text = text.split(':')[1]`
			`text = text.strip('" ')`
			`return text`

i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`def _parse_date(text: str) -> DTDate:`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# Method for validating metadata from the PDF against the request data`
Initial commit 2019-03-27 17:00:04 +01:00			`text = text.split(': ')[1].rstrip()`
i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`return DTDateTime.strptime(text, '%d.%m.%Y').date()`
Initial commit 2019-03-27 17:00:04 +01:00
			`def _array_remove_empty(obj: list) -> List[str]:`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# just a macro for removing empty or empty-string array objects`
Initial commit 2019-03-27 17:00:04 +01:00			`try:`
			`while True:`
			`obj.remove('')`
			`except ValueError:`
			`return obj`
			`return obj`

			`def _parse_line(line: str) -> Rate or None:`
			`arr = line.split(" ") # 3 spaces = minimum separation in PDF`
			`arr = _array_remove_empty(arr)`
			`# process currency name`
			`names = arr[0].split(" ", 1)`
			`rate = Rate(`
			`abbr=names[0],`
			`full_name=names[1].strip("()"),`
			`ask=_parse_rate(arr[1]),`
			`bid=_parse_rate(arr[2])`
			`)`
			`return rate`


			`def get_results_from_text(text: str, currency: str = None) -> CurrencyResult:`
			`rates = {}`
			`result = CurrencyResult()`
			`lines = text.splitlines()`
			`# skip intro lines`
			`lines = lines[2:]`
			`# card type`
			`result.card_type = _parse_card_type(lines.pop(0))`
			`# get date`
			`result.date = _parse_date(lines.pop(0))`
			`# skip more lines`
			`lines = lines[4:]`
			`# now the rates begin`
			`if currency is None:`
			`for line in lines:`
			`line_result = _parse_line(line)`
			`rates[line_result.abbr] = line_result`
			`else:`
			`pattern = re.compile("^"+currency)`
			`for line in lines:`
			`if pattern.match(line):`
			`line_result = _parse_line(line)`
			`rates[line_result.abbr] = line_result`
			`result.rates = rates`
			`return result`

			`def get_results_from_pdf(buf: BinaryIO or str, currency: str = None) -> CurrencyResult:`
i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`print('Parsing data... ', end='')`
Initial commit 2019-03-27 17:00:04 +01:00			`reader = PyPDF3.PdfFileReader(buf)`
			`text = str()`
			`pages = []`
			`for num in range(0, reader.getNumPages()-1):`
			`pages.append(reader.getPage(num))`
			`for page in pages:`
			`text += page.extractText()`
i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`print('Done.')`
Initial commit 2019-03-27 17:00:04 +01:00			`return get_results_from_text(text, currency=currency)`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`def get_fileio(date: DTDate, card_type: List[str] = CARD_VISA) -> BinaryIO: # pylint: disable=dangerous-default-value`
i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`# pylint: disable=no-member # mechanize.Browser has some lazy-loading methods that pylint doesn't see`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`print('Downloading rates for ' + date.strftime('%Y-%m-%d') + '... ', end='')`
			`b = m.Browser()`
			`# Firefox 64 User-Agent`
			`# ua = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'`
			`# b.set_header('User-Agent', ua)`
			`# Ignore robots.txt`
			`# b.set_handle_robots(False)`
			`# Debugging flags`
			`# b.set_debug_http(True)`
			`# b.set_debug_redirects(True)`
			`# b.set_debug_responses(True)`
			`# PDF URL`
			`b.open('https://misc.firstdata.eu/CurrencyCalculator/fremdwaehrungskurse/pdf')`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`fm = b.forms()[0]`
			`# This must be done because I can't change the options otherwise`
			`fm.set_all_readonly(False)`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`# Configure form`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`fm['creditCardsRadio'] = card_type`
			`fm['selectedDatesString'] = str(date.strftime('%Y%m%d') + ',')`
			`# Retrieve file using button click; the button is 115x21 pixels in size.`
			`# The API apparently doesn't like the max values`
			`rq = fm.click(name='submitButton', coord=(random.randint(1, 114), random.randint(1, 20)))`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`rq.add_header('Accept', '/')`
			`rp = b.retrieve(rq)`
			`print(' Done.')`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# Returns an open file-like object with the PDF as contents`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`return open(rp[0], 'rb')`
			`def get_date() -> DTDate:`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# For Sunday and Monday, use Friday's data; Saturday and Sunday are completely null`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`if DTDate.today().weekday() in [6, 0]:`
			`date = DTDate.today() + relativedelta(weekday=FR(-1))`
			`else:`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# For all other days, the previous day is fine`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`date = DTDate.today() - DTTimeDelta(1)`
			`return date`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00
			`def mk_filename(date: DTDate, card_type: List[str]) -> str:`
			`# List[str] is used because I don't want to make a class for just this`
			`if card_type == CARD_MASTERCARD:`
			`fn = date.isoformat() + '_MC.pdf'`
			`elif card_type == CARD_VISA:`
			`fn = date.isoformat() + '_VISA.pdf'`
			`else:`
			`raise TypeError("not a valid card type")`
			`return fn`