first-data-crawler-pdf/utils.py

#!/usr/bin/env python3
# pylint: disable=missing-docstring,invalid-name

import random
import re
from collections import namedtuple
from datetime import date as DTDate
from datetime import timedelta as DTTimeDelta
from datetime import datetime as DTDateTime
from typing import BinaryIO, List

import mechanize as m
import PyPDF3
from dateutil.relativedelta import FR, relativedelta

# One parsed row of the rates table: currency abbreviation, full currency
# name, and the two quoted values (each a float, or None when unparseable).
Rate = namedtuple('Rate', 'abbr full_name ask bid')

# Card-type selectors. They are single-element lists because get_fileio
# assigns them directly to the 'creditCardsRadio' form control.
CARD_MASTERCARD = ['0']
CARD_VISA = ['1']


class CurrencyResult:
    """Container for the data parsed out of one rates document.

    Attributes:
        rates: starts as an empty list; get_results_from_text replaces it
            with a dict mapping currency abbreviation to its Rate.
        card_type: card type label read from the document, '' until set.
        date: date the rates apply to, None until set.
    """

    def __init__(self):
        # Empty placeholders; the parser fills these in afterwards.
        self.rates = []
        self.card_type = ''
        self.date = None


def _parse_rate(text: str) -> float or None:
    if re.match('Keine Kursdaten vorhanden', text):
        _r = None
    else:
        # strip whitespace and format decimal numbers correctly for parsing
        text = text.strip(' ').replace(',', '.')
        try:
            _r = float(text)
        except ValueError:
            _r = None
    return _r


def _parse_card_type(text: str) -> str:
    # Method for validating metadata from the PDF against the request data
    text = text.split(':')[1]
    text = text.strip('" ')
    return text


def _parse_date(text: str) -> DTDate:
    # Method for validating metadata from the PDF against the request data
    text = text.split(': ')[1].rstrip()
    return DTDateTime.strptime(text, '%d.%m.%Y').date()


def _array_remove_empty(obj: list) -> List[str]:
    # just a macro for removing empty or empty-string array objects
    try:
        while True:
            obj.remove('')
    except ValueError:
        return obj
    return obj


def _parse_line(line: str) -> Rate or None:
    """Turn one line of the rates table into a Rate.

    The line is expected to hold three columns: 'ABBR (Full name)', the ask
    value and the bid value. Raises IndexError when a column or the full
    name is missing.

    NOTE: the annotation ``Rate or None`` evaluates to plain ``Rate``; this
    function never returns None.
    """
    # Columns are separated by at least three spaces in the PDF text; wider
    # gaps leave empty fragments behind, which are dropped.
    columns = _array_remove_empty(line.split("   "))
    # First column: abbreviation, one space, then the name in parentheses.
    name_parts = columns[0].split(" ", 1)
    abbr = name_parts[0]
    full_name = name_parts[1].strip("()")
    ask = _parse_rate(columns[1])
    bid = _parse_rate(columns[2])
    return Rate(abbr, full_name, ask, bid)


def get_results_from_text(text: str, currency: str = None) -> CurrencyResult:
    """Parse the text extracted from a rates PDF into a CurrencyResult.

    Args:
        text: full extracted text of the document.
        currency: when given, only lines matching this value at their start
            are parsed (it is used as a regular expression); when None,
            every line after the header is parsed.

    Returns:
        A CurrencyResult whose ``rates`` is a dict mapping each currency
        abbreviation to its Rate, with ``card_type`` and ``date`` taken from
        the document header.
    """
    result = CurrencyResult()
    # Header layout: two intro lines, the card-type line, the date line and
    # four more lines that are skipped; the rate lines follow.
    remaining = text.splitlines()[2:]
    result.card_type = _parse_card_type(remaining.pop(0))
    result.date = _parse_date(remaining.pop(0))
    rate_lines = remaining[4:]

    # No matcher means "take every line".
    matcher = None if currency is None else re.compile("^"+currency)
    rates = {}
    for line in rate_lines:
        if matcher is None or matcher.match(line):
            parsed = _parse_line(line)
            rates[parsed.abbr] = parsed
    result.rates = rates
    return result


def get_results_from_pdf(buf: BinaryIO or str, currency: str = None) -> CurrencyResult:
    """Extract the text of a rates PDF and parse it into a CurrencyResult.

    Args:
        buf: PDF source handed to PyPDF3.PdfFileReader (an open binary
            stream or a path).
        currency: optional filter forwarded to get_results_from_text.

    Returns:
        The CurrencyResult produced by get_results_from_text for the
        concatenated text of all pages.
    """
    print('Parsing data... ', end='')
    reader = PyPDF3.PdfFileReader(buf)
    # range() already excludes its upper bound, so the page count is used
    # as-is; the previous "getNumPages() - 1" skipped the last page and
    # produced no text at all for a single-page document.
    text = ''.join(
        reader.getPage(num).extractText()
        for num in range(reader.getNumPages())
    )
    print('Done.')
    return get_results_from_text(text, currency=currency)


def get_fileio(date: DTDate, card_type: List[str] = CARD_VISA) -> BinaryIO: # pylint: disable=dangerous-default-value
    """Download the rates PDF for one day and return it as an open file.

    Args:
        date: day whose rates are requested.
        card_type: value for the card-type radio control, CARD_VISA or
            CARD_MASTERCARD.

    Returns:
        An open binary file object with the PDF as contents; closing it is
        left to the caller.
    """
    # pylint: disable=no-member # mechanize.Browser has some lazy-loading methods that pylint doesn't see
    day_label = date.strftime('%Y-%m-%d')
    print('Downloading rates for ' + day_label + '... ', end='')
    browser = m.Browser()
    # Optional tweaks, currently switched off:
    # Firefox 64 User-Agent
    # ua = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'
    # browser.set_header('User-Agent', ua)
    # Ignore robots.txt
    # browser.set_handle_robots(False)
    # Debugging flags
    # browser.set_debug_http(True)
    # browser.set_debug_redirects(True)
    # browser.set_debug_responses(True)

    # Page that hosts the PDF request form.
    browser.open('https://misc.firstdata.eu/CurrencyCalculator/fremdwaehrungskurse/pdf')
    form = browser.forms()[0]
    # Without lifting the read-only flags the controls below cannot be changed.
    form.set_all_readonly(False)
    form['creditCardsRadio'] = card_type
    form['selectedDatesString'] = date.strftime('%Y%m%d') + ','
    # The file is retrieved through a button click; the button is 115x21
    # pixels in size and the API apparently doesn't like the max values, so
    # the click position stays strictly inside those bounds.
    click_x = random.randint(1, 114)
    click_y = random.randint(1, 20)
    request = form.click(name='submitButton', coord=(click_x, click_y))
    request.add_header('Accept', '*/*')
    retrieved = browser.retrieve(request)
    print(' Done.')
    # The first item of the retrieve() result is used as the path to open.
    return open(retrieved[0], 'rb')


def get_date(today: DTDate = None) -> DTDate:
    """Return the most recent day for which rates should be requested.

    On Sunday and Monday the preceding Friday is used (per the original
    author's note, Saturday and Sunday carry no data); on every other day
    the previous day is used.

    Args:
        today: reference day; defaults to the current local date. Accepting
            it as a parameter keeps the calculation on a single clock
            reading and makes the function testable.

    Returns:
        The date to request rates for.
    """
    if today is None:
        today = DTDate.today()
    weekday = today.weekday()  # Monday == 0 ... Sunday == 6
    if weekday == 6:
        # Sunday -> Friday
        days_back = 2
    elif weekday == 0:
        # Monday -> Friday
        days_back = 3
    else:
        # For all other days, the previous day is fine
        days_back = 1
    return today - DTTimeDelta(days=days_back)


def mk_filename(date: DTDate, card_type: List[str]) -> str:
    """Build the PDF file name for a given day and card type.

    Args:
        date: day the file belongs to; its ISO form is the name prefix.
        card_type: CARD_MASTERCARD or CARD_VISA (plain lists are used so
            that no dedicated class is needed for this).

    Returns:
        '<ISO date>_MC.pdf' or '<ISO date>_VISA.pdf'.

    Raises:
        TypeError: if card_type equals neither known card type.
    """
    # The card types are lists and therefore unhashable, so they are
    # compared one by one instead of being used as dict keys.
    known_suffixes = ((CARD_MASTERCARD, '_MC.pdf'), (CARD_VISA, '_VISA.pdf'))
    for known_type, suffix in known_suffixes:
        if card_type == known_type:
            return date.isoformat() + suffix
    raise TypeError("not a valid card type")