first-data-crawler-pdf/utils.py

#!/usr/bin/env python3
# pylint: disable=missing-docstring,invalid-name

import random
import re
from collections import OrderedDict
from collections import namedtuple
from datetime import date as DTDate
from datetime import datetime as DTDateTime
from datetime import timedelta as DTTimeDelta
from sys import stderr
from typing import BinaryIO, Dict, List, Optional, Union

import mechanize as m
import PyPDF3
from dateutil.relativedelta import FR, relativedelta

# One parsed exchange-rate row: currency abbreviation, full currency name,
# ask and bid rates (None when a rate could not be parsed) and the date
# taken from the parsing context.
Rate = namedtuple('Rate', ['abbr', 'full_name', 'ask', 'bid', 'date'])

# Constants
# Card-type selectors. get_fileio() assigns one of these lists to the
# 'creditCardsRadio' form control and mk_filename() maps them to a file suffix.
CARD_MASTERCARD = ['0']
CARD_VISA = ['1']

class CurrencyResult:
    """Container for the data parsed from one rates document.

    Attributes are filled in by the parsing functions of this module; a
    freshly created instance holds no rates, an empty card type and no date.
    """

    def __init__(self):
        # Parsed rates keyed by currency abbreviation. This must be a real
        # (empty) mapping instance, not the typing alias itself.
        self.rates: Dict[str, Rate] = OrderedDict()
        # Card type string as read from the document metadata.
        self.card_type = str()
        # Date the rates apply to; None until it has been parsed.
        self.date = None


def _parse_rate(text: str) -> Optional[float]:
    """Convert one rate column of the extracted PDF text into a float.

    The text uses a decimal comma, which is replaced by a decimal point
    before parsing.

    Args:
        text: Raw column text, e.g. ``'1,2345'``.

    Returns:
        The parsed rate, or ``None`` when the text starts with the
        'Keine Kursdaten vorhanden' marker or is not a valid number.
    """
    # The "no rate data" marker is matched at the start of the raw text.
    if re.match('Keine Kursdaten vorhanden', text):
        return None
    try:
        # strip padding spaces and format decimal numbers correctly for parsing
        return float(text.strip(' ').replace(',', '.'))
    except ValueError:
        return None


def _parse_card_type(text: str) -> str:
    """Extract the card type from a colon-separated metadata line.

    Used for validating metadata from the PDF against the request data.
    Returns the segment following the first colon, with surrounding double
    quotes and spaces trimmed. Raises IndexError when the line has no colon.
    """
    segments = text.split(':')
    return segments[1].strip('" ')


def _parse_date(text: str) -> DTDate:
    """Extract the date from a 'label: DD.MM.YYYY' metadata line.

    Used for validating metadata from the PDF against the request data.
    Raises IndexError when the ': ' separator is missing and ValueError when
    the value is not in day.month.year form.
    """
    raw_date = text.split(': ')[1]
    parsed = DTDateTime.strptime(raw_date.rstrip(), '%d.%m.%Y')
    return parsed.date()


def _array_remove_empty(obj: list) -> List[str]:
    """Remove every empty-string element from ``obj`` in place.

    Args:
        obj: List to clean up; it is modified in place.

    Returns:
        The same list object, now without any ``''`` entries.
    """
    # Slice assignment keeps the caller's list object (callers may rely on
    # the in-place mutation) while filtering in a single pass.
    obj[:] = [item for item in obj if item != '']
    return obj


def _parse_line(line: str, ctx: CurrencyResult) -> Rate:
    """Parse one rate line of the extracted PDF text into a Rate.

    Args:
        line: Text line holding the currency name followed by the ask and
            bid columns, separated by runs of at least three spaces.
        ctx: Result object being filled; only its ``date`` is read and
            copied into the returned Rate.

    Returns:
        The parsed Rate. ``ask`` / ``bid`` are None when the respective
        column is not a valid number.

    Raises:
        IndexError: If the line has fewer than three columns or the first
            column contains no space between abbreviation and full name.
    """
    # 3 spaces = minimum separation in PDF; longer runs of spaces produce
    # empty strings, which are dropped.
    columns = _array_remove_empty(line.split("   "))
    # First column: abbreviation, a space, then the full currency name
    # (surrounding parentheses are stripped from the full name).
    names = columns[0].split(" ", 1)
    return Rate(
        abbr=names[0],
        full_name=names[1].strip("()"),
        ask=_parse_rate(columns[1]),
        bid=_parse_rate(columns[2]),
        date=ctx.date,
    )


def get_results_from_text(text: str, currency: str = None, quiet: bool = False) -> CurrencyResult:
    """Build a CurrencyResult from the text extracted from a rates PDF.

    Layout of the text as consumed here: two intro lines, one card-type line,
    one date line, four more skipped lines, then one rate per line.

    Args:
        text: Full extracted text.
        currency: When given, only lines matching the regular expression
            ``"^" + currency`` are parsed; otherwise every rate line is.
        quiet: Accepted for signature parity; not used by this function.

    Returns:
        The result with card type, date and an ordered mapping of currency
        abbreviation to Rate.
    """
    result = CurrencyResult()
    all_lines = text.splitlines()
    # Index 2 holds the card type, index 3 the date (after two intro lines).
    result.card_type = _parse_card_type(all_lines[2])
    result.date = _parse_date(all_lines[3])
    # Four further lines are skipped; the rates begin at index 8.
    rate_lines = all_lines[8:]
    if currency is not None:
        wanted = re.compile("^"+currency)
        rate_lines = [candidate for candidate in rate_lines if wanted.match(candidate)]
    parsed_rates = OrderedDict()
    for rate_line in rate_lines:
        parsed = _parse_line(rate_line, result)
        parsed_rates[parsed.abbr] = parsed
    result.rates = parsed_rates
    return result


def get_results_from_pdf(buf: Union[BinaryIO, str], currency: str = None, quiet: bool = False) -> CurrencyResult:
    """Extract the text of a rates PDF and parse it into a CurrencyResult.

    Args:
        buf: Open binary file object or path, handed to PyPDF3.PdfFileReader.
        currency: Optional filter forwarded to get_results_from_text().
        quiet: When True, no progress messages are written to stderr.

    Returns:
        The result produced by get_results_from_text() for the extracted text.
    """
    if not quiet:
        print('Parsing data... ', end='', file=stderr)
    reader = PyPDF3.PdfFileReader(buf)
    # NOTE(review): the range stops one short of getNumPages(), so the final
    # page is never extracted (and a single-page PDF yields empty text).
    # Kept as-is because the parser may depend on it - confirm it is intended.
    page_texts = [
        reader.getPage(num).extractText()
        for num in range(0, reader.getNumPages()-1)
    ]
    text = ''.join(page_texts)
    if not quiet:
        print('Done.', file=stderr)
    return get_results_from_text(text, currency=currency, quiet=quiet)


def get_fileio(date: DTDate, card_type: List[str] = CARD_VISA, quiet: bool = False) -> BinaryIO: # pylint: disable=dangerous-default-value
    """Download the rates PDF for one date and return it as an open file.

    Args:
        date: Day whose rates are requested.
        card_type: Value for the 'creditCardsRadio' form control
            (CARD_VISA or CARD_MASTERCARD).
        quiet: When True, no progress messages are written to stderr.

    Returns:
        A file object opened in binary read mode on the downloaded PDF.
        The caller is responsible for closing it.
    """
    # pylint: disable=no-member # mechanize.Browser has some lazy-loading methods that pylint doesn't see
    if not quiet:
        print('Downloading rates for {}... '.format(date.strftime('%Y-%m-%d')), end='', file=stderr)
    browser = m.Browser()
    # A Firefox 64 User-Agent can be set if ever needed:
    # ua = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'
    # browser.set_header('User-Agent', ua)
    # Ignore robots.txt
    browser.set_handle_robots(False)
    # Debugging flags:
    # browser.set_debug_http(True)
    # browser.set_debug_redirects(True)
    # browser.set_debug_responses(True)
    # PDF URL
    browser.open('https://online.firstdata.com/CurrencyCalculator/fremdwaehrungskurse/pdf')
    form = browser.forms()[0]
    # The controls must be made writable, otherwise the options can't be changed
    form.set_all_readonly(False)
    # Configure form
    form['creditCardsRadio'] = card_type
    form['selectedDatesString'] = date.strftime('%Y%m%d') + ','
    # Retrieve file using button click; the button is 115x21 pixels in size.
    # The API apparently doesn't like the max values
    click_x = random.randint(1, 114)
    click_y = random.randint(1, 20)
    request = form.click(name='submitButton', coord=(click_x, click_y))
    request.add_header('Accept', '*/*')
    retrieved = browser.retrieve(request)
    if not quiet:
        print(' Done.', file=stderr)
    # First element of the retrieve() result is opened as the local PDF file
    return open(retrieved[0], 'rb')


def get_date(today: Optional[DTDate] = None) -> DTDate:
    """Return the most recent day for which rate data is expected to exist.

    Saturday and Sunday carry no data, so Sunday and Monday fall back to the
    preceding Friday; every other day uses the previous calendar day (which
    for Saturday is Friday as well).

    Args:
        today: Reference day. Defaults to the current local date, read
            exactly once so the result cannot straddle midnight.

    Returns:
        The date whose rates should be requested.
    """
    if today is None:
        today = DTDate.today()
    weekday = today.weekday()  # Monday == 0 ... Sunday == 6
    if weekday == 6:
        # Sunday -> Friday
        days_back = 2
    elif weekday == 0:
        # Monday -> Friday
        days_back = 3
    else:
        # For all other days, the previous day is fine
        days_back = 1
    return today - DTTimeDelta(days=days_back)


def mk_filename(date: DTDate, card_type: List[str]) -> str:
    """Build the PDF file name for a date and card type.

    The name is the ISO date followed by '_MC.pdf' for CARD_MASTERCARD or
    '_VISA.pdf' for CARD_VISA. Any other card type raises TypeError.
    """
    # List[str] is used for card types to avoid a dedicated class
    known_suffixes = (
        (CARD_MASTERCARD, '_MC.pdf'),
        (CARD_VISA, '_VISA.pdf'),
    )
    for known_type, suffix in known_suffixes:
        if card_type == known_type:
            return date.isoformat() + suffix
    raise TypeError("not a valid card type")
Initial commit 2019-03-27 17:00:04 +01:00			`#!/usr/bin/env python3`
			`# pylint: disable=missing-docstring,invalid-name`

i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`import random`
Initial commit 2019-03-27 17:00:04 +01:00			`import re`
i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`from collections import namedtuple`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`from datetime import date as DTDate`
			`from datetime import timedelta as DTTimeDelta`
i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`from datetime import datetime as DTDateTime`
Add CSV writing feature 2020-07-18 20:22:33 +02:00			`from typing import BinaryIO, List, Dict`
			`from collections import OrderedDict`
			`from sys import stderr`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00
			`import mechanize as m`
i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`import PyPDF3`
			`from dateutil.relativedelta import FR, relativedelta`
Initial commit 2019-03-27 17:00:04 +01:00
Add CSV writing feature 2020-07-18 20:22:33 +02:00			`Rate = namedtuple('Rate', ['abbr', 'full_name', 'ask', 'bid', 'date'])`
Initial commit 2019-03-27 17:00:04 +01:00
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`# Constants`
			`CARD_MASTERCARD = ['0']`
			`CARD_VISA = ['1']`

PEP8ing whitespace 2019-06-21 22:16:16 +02:00			`class CurrencyResult:`
Initial commit 2019-03-27 17:00:04 +01:00			`def __init__(self):`
Add CSV writing feature 2020-07-18 20:22:33 +02:00			`self.rates = Dict[str, Rate]`
Initial commit 2019-03-27 17:00:04 +01:00			`self.card_type = str()`
			`self.date = None`

PEP8ing whitespace 2019-06-21 22:16:16 +02:00
Initial commit 2019-03-27 17:00:04 +01:00			`def _parse_rate(text: str) -> float or None:`
			`if re.match('Keine Kursdaten vorhanden', text):`
			`_r = None`
			`else:`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# strip whitespace and format decimal numbers correctly for parsing`
Initial commit 2019-03-27 17:00:04 +01:00			`text = text.strip(' ').replace(',', '.')`
			`try:`
			`_r = float(text)`
			`except ValueError:`
			`_r = None`
			`return _r`

PEP8ing whitespace 2019-06-21 22:16:16 +02:00
Initial commit 2019-03-27 17:00:04 +01:00			`def _parse_card_type(text: str) -> str:`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# Method for validating metadata from the PDF against the request data`
Initial commit 2019-03-27 17:00:04 +01:00			`text = text.split(':')[1]`
			`text = text.strip('" ')`
			`return text`

PEP8ing whitespace 2019-06-21 22:16:16 +02:00
i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`def _parse_date(text: str) -> DTDate:`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# Method for validating metadata from the PDF against the request data`
Initial commit 2019-03-27 17:00:04 +01:00			`text = text.split(': ')[1].rstrip()`
i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`return DTDateTime.strptime(text, '%d.%m.%Y').date()`
Initial commit 2019-03-27 17:00:04 +01:00
PEP8ing whitespace 2019-06-21 22:16:16 +02:00
Initial commit 2019-03-27 17:00:04 +01:00			`def _array_remove_empty(obj: list) -> List[str]:`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# just a macro for removing empty or empty-string array objects`
Initial commit 2019-03-27 17:00:04 +01:00			`try:`
			`while True:`
			`obj.remove('')`
			`except ValueError:`
			`return obj`
			`return obj`

PEP8ing whitespace 2019-06-21 22:16:16 +02:00
Add CSV writing feature 2020-07-18 20:22:33 +02:00			`def _parse_line(line: str, ctx: CurrencyResult) -> Rate or None:`
Initial commit 2019-03-27 17:00:04 +01:00			`arr = line.split(" ") # 3 spaces = minimum separation in PDF`
			`arr = _array_remove_empty(arr)`
			`# process currency name`
			`names = arr[0].split(" ", 1)`
			`rate = Rate(`
			`abbr=names[0],`
			`full_name=names[1].strip("()"),`
			`ask=_parse_rate(arr[1]),`
Add CSV writing feature 2020-07-18 20:22:33 +02:00			`bid=_parse_rate(arr[2]),`
			`date=ctx.date`
Initial commit 2019-03-27 17:00:04 +01:00			`)`
			`return rate`


Add quiet flag for usage in cronjobs 2020-08-24 17:20:36 +02:00			`def get_results_from_text(text: str, currency: str = None, quiet: bool = False) -> CurrencyResult:`
Add CSV writing feature 2020-07-18 20:22:33 +02:00			`rates = OrderedDict()`
Initial commit 2019-03-27 17:00:04 +01:00			`result = CurrencyResult()`
			`lines = text.splitlines()`
			`# skip intro lines`
			`lines = lines[2:]`
			`# card type`
			`result.card_type = _parse_card_type(lines.pop(0))`
			`# get date`
			`result.date = _parse_date(lines.pop(0))`
			`# skip more lines`
			`lines = lines[4:]`
			`# now the rates begin`
			`if currency is None:`
			`for line in lines:`
Add CSV writing feature 2020-07-18 20:22:33 +02:00			`line_result = _parse_line(line, result)`
Initial commit 2019-03-27 17:00:04 +01:00			`rates[line_result.abbr] = line_result`
			`else:`
			`pattern = re.compile("^"+currency)`
			`for line in lines:`
			`if pattern.match(line):`
Add CSV writing feature 2020-07-18 20:22:33 +02:00			`line_result = _parse_line(line, result)`
Initial commit 2019-03-27 17:00:04 +01:00			`rates[line_result.abbr] = line_result`
			`result.rates = rates`
			`return result`

PEP8ing whitespace 2019-06-21 22:16:16 +02:00
Add quiet flag for usage in cronjobs 2020-08-24 17:20:36 +02:00			`def get_results_from_pdf(buf: BinaryIO or str, currency: str = None, quiet: bool = False) -> CurrencyResult:`
			`if not quiet:`
			`print('Parsing data... ', end='', file=stderr)`
Initial commit 2019-03-27 17:00:04 +01:00			`reader = PyPDF3.PdfFileReader(buf)`
			`text = str()`
			`for num in range(0, reader.getNumPages()-1):`
remove unnecessary array 2019-05-08 11:15:55 +02:00			`text += reader.getPage(num).extractText()`
Add quiet flag for usage in cronjobs 2020-08-24 17:20:36 +02:00			`if not quiet:`
			`print('Done.', file=stderr)`
			`return get_results_from_text(text, currency=currency, quiet=quiet)`
PEP8ing whitespace 2019-06-21 22:16:16 +02:00

Add quiet flag for usage in cronjobs 2020-08-24 17:20:36 +02:00			`def get_fileio(date: DTDate, card_type: List[str] = CARD_VISA, quiet: bool = False) -> BinaryIO: # pylint: disable=dangerous-default-value`
i don't even know, i'm drunk 2019-05-04 21:54:34 +02:00			`# pylint: disable=no-member # mechanize.Browser has some lazy-loading methods that pylint doesn't see`
Add quiet flag for usage in cronjobs 2020-08-24 17:20:36 +02:00			`if not quiet:`
			`print('Downloading rates for ' + date.strftime('%Y-%m-%d') + '... ', end='', file=stderr)`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`b = m.Browser()`
			`# Firefox 64 User-Agent`
			`# ua = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'`
			`# b.set_header('User-Agent', ua)`
			`# Ignore robots.txt`
New URL with robots.txt 2020-08-18 23:49:19 +02:00			`b.set_handle_robots(False)`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`# Debugging flags`
			`# b.set_debug_http(True)`
			`# b.set_debug_redirects(True)`
			`# b.set_debug_responses(True)`
			`# PDF URL`
New URL with robots.txt 2020-08-18 23:49:19 +02:00			`b.open('https://online.firstdata.com/CurrencyCalculator/fremdwaehrungskurse/pdf')`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`fm = b.forms()[0]`
			`# This must be done because I can't change the options otherwise`
			`fm.set_all_readonly(False)`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`# Configure form`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`fm['creditCardsRadio'] = card_type`
			`fm['selectedDatesString'] = str(date.strftime('%Y%m%d') + ',')`
			`# Retrieve file using button click; the button is 115x21 pixels in size.`
			`# The API apparently doesn't like the max values`
			`rq = fm.click(name='submitButton', coord=(random.randint(1, 114), random.randint(1, 20)))`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`rq.add_header('Accept', '/')`
			`rp = b.retrieve(rq)`
Add quiet flag for usage in cronjobs 2020-08-24 17:20:36 +02:00			`if not quiet:`
			`print(' Done.', file=stderr)`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# Returns an open file-like object with the PDF as contents`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`return open(rp[0], 'rb')`
PEP8ing whitespace 2019-06-21 22:16:16 +02:00

[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`def get_date() -> DTDate:`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# For Sunday and Monday, use Friday's data; Saturday and Sunday are completely null`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`if DTDate.today().weekday() in [6, 0]:`
			`date = DTDate.today() + relativedelta(weekday=FR(-1))`
			`else:`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`# For all other days, the previous day is fine`
[refactor] move some functions to utils.py 2019-05-04 20:55:02 +02:00			`date = DTDate.today() - DTTimeDelta(1)`
			`return date`
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00
PEP8ing whitespace 2019-06-21 22:16:16 +02:00
more docs, if/elif fixed, more functions in utils 2019-05-04 22:55:18 +02:00			`def mk_filename(date: DTDate, card_type: List[str]) -> str:`
			`# List[str] is used because I don't want to make a class for just this`
			`if card_type == CARD_MASTERCARD:`
			`fn = date.isoformat() + '_MC.pdf'`
			`elif card_type == CARD_VISA:`
			`fn = date.isoformat() + '_VISA.pdf'`
			`else:`
			`raise TypeError("not a valid card type")`
			`return fn`
Add CSV writing feature 2020-07-18 20:22:33 +02:00