first-data-crawler-pdf/utils.py

158 lines
5.2 KiB
Python

#!/usr/bin/env python3
# pylint: disable=missing-docstring,invalid-name
import random
import re
from collections import namedtuple
from datetime import date as DTDate
from datetime import datetime as DTDateTime
from datetime import timedelta as DTTimeDelta
from typing import BinaryIO, List, Optional

import mechanize as m
import PyPDF3
from dateutil.relativedelta import FR, relativedelta
# One row of the rate table: currency abbreviation, full currency name and
# the two rate columns (each a float, or None when no number could be parsed).
Rate = namedtuple('Rate', ('abbr', 'full_name', 'ask', 'bid'))

# Card type selectors. They are single-element lists because get_fileio
# assigns them directly to the 'creditCardsRadio' form control.
CARD_MASTERCARD = ['0']
CARD_VISA = ['1']
class CurrencyResult:
    """Holds everything extracted from one rates document.

    A fresh instance is empty; the parsing code fills in the attributes.
    """

    def __init__(self):
        # Parsed rates; starts out as an empty list.
        self.rates = []
        # Card type read from the document header; empty string until parsed.
        self.card_type = ''
        # Date read from the document header; None until parsed.
        self.date = None
def _parse_rate(text: str) -> float or None:
if re.match('Keine Kursdaten vorhanden', text):
_r = None
else:
# strip whitespace and format decimal numbers correctly for parsing
text = text.strip(' ').replace(',', '.')
try:
_r = float(text)
except ValueError:
_r = None
return _r
def _parse_card_type(text: str) -> str:
# Method for validating metadata from the PDF against the request data
text = text.split(':')[1]
text = text.strip('" ')
return text
def _parse_date(text: str) -> DTDate:
# Method for validating metadata from the PDF against the request data
text = text.split(': ')[1].rstrip()
return DTDateTime.strptime(text, '%d.%m.%Y').date()
def _array_remove_empty(obj: list) -> List[str]:
# just a macro for removing empty or empty-string array objects
try:
while True:
obj.remove('')
except ValueError:
return obj
return obj
def _parse_line(line: str) -> Rate:
    """Parse one row of the rate table into a ``Rate``.

    The row is expected to hold three columns separated by at least three
    spaces: ``ABBR (Full name)``, the ask rate and the bid rate.

    Args:
        line: One line of text extracted from the PDF.

    Returns:
        The parsed ``Rate``; ``ask`` / ``bid`` are ``None`` when the column
        holds no number.

    Raises:
        IndexError: if the row has fewer than three columns, or the first
            column has no full name after the abbreviation.
    """
    # Three or more spaces separate columns in the PDF text. A single space
    # only occurs inside a column (between abbreviation and full name), so it
    # must not be used as the column separator.
    columns = [col.strip() for col in re.split(r' {3,}', line)]
    columns = [col for col in columns if col]
    # First column: abbreviation, one space, full name in parentheses.
    names = columns[0].split(' ', 1)
    return Rate(
        abbr=names[0],
        full_name=names[1].strip('()'),
        ask=_parse_rate(columns[1]),
        bid=_parse_rate(columns[2]),
    )
def get_results_from_text(text: str, currency: Optional[str] = None) -> CurrencyResult:
    """Build a ``CurrencyResult`` from the text extracted from a rates PDF.

    The text is read line by line: two intro lines are skipped, the next
    line holds the card type, the one after it the date, four more lines
    are skipped and every remaining line is treated as a rate row.

    Args:
        text: Full text of the document.
        currency: When given, only rows that start with this exact text are
            parsed; when ``None``, every row is parsed.

    Returns:
        A ``CurrencyResult`` whose ``rates`` is a dict mapping each currency
        abbreviation to its ``Rate``.

    Raises:
        IndexError: if the text has fewer lines than the header layout needs,
            or a row cannot be split into its columns.
    """
    result = CurrencyResult()
    lines = text.splitlines()
    # Header layout: lines 0-1 intro, 2 card type, 3 date, 4-7 skipped.
    result.card_type = _parse_card_type(lines[2])
    result.date = _parse_date(lines[3])
    rows = lines[8:]
    if currency is not None:
        # Literal prefix comparison: the value is plain text, so it must not
        # be interpreted as a regular expression.
        rows = [row for row in rows if row.startswith(currency)]
    rates = {}
    for row in rows:
        rate = _parse_line(row)
        rates[rate.abbr] = rate
    result.rates = rates
    return result
def get_results_from_pdf(buf: BinaryIO or str, currency: str = None) -> CurrencyResult:
    """Extract the text of a rates PDF and parse it into a ``CurrencyResult``.

    Args:
        buf: The PDF, passed straight to ``PyPDF3.PdfFileReader``.
        currency: Forwarded to ``get_results_from_text``.

    Returns:
        Whatever ``get_results_from_text`` builds from the extracted text.
    """
    print('Parsing data... ', end='')
    reader = PyPDF3.PdfFileReader(buf)
    # NOTE(review): the bound is getNumPages() - 1, so the final page is never
    # read (and a one-page PDF yields no text at all) -- confirm this is
    # intentional and not an off-by-one.
    page_count = reader.getNumPages() - 1
    pages = [reader.getPage(index) for index in range(page_count)]
    text = ''.join(page.extractText() for page in pages)
    print('Done.')
    return get_results_from_text(text, currency=currency)
def get_fileio(date: DTDate, card_type: List[str] = CARD_VISA) -> BinaryIO:  # pylint: disable=dangerous-default-value
    """Download the rates PDF for one day and return it as an open file.

    Args:
        date: Day to request rates for.
        card_type: Value for the 'creditCardsRadio' form control.

    Returns:
        The downloaded PDF opened in binary read mode; the caller is
        responsible for closing it.
    """
    # pylint: disable=no-member  # mechanize.Browser has some lazy-loading methods that pylint doesn't see
    day_label = date.strftime('%Y-%m-%d')
    print('Downloading rates for ' + day_label + '... ', end='')
    browser = m.Browser()
    # Optional browser tweaks, left disabled:
    #   Firefox 64 User-Agent
    #     ua = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'
    #     browser.set_header('User-Agent', ua)
    #   Ignore robots.txt
    #     browser.set_handle_robots(False)
    #   Debugging flags
    #     browser.set_debug_http(True)
    #     browser.set_debug_redirects(True)
    #     browser.set_debug_responses(True)
    browser.open('https://misc.firstdata.eu/CurrencyCalculator/fremdwaehrungskurse/pdf')
    form = browser.forms()[0]
    # The controls must be made writable, otherwise the options can't be changed.
    form.set_all_readonly(False)
    form['creditCardsRadio'] = card_type
    form['selectedDatesString'] = date.strftime('%Y%m%d') + ','
    # Submit via a simulated click on the 115x21 pixel button. The API
    # apparently doesn't like the max values, so stay below them.
    click_x = random.randint(1, 114)
    click_y = random.randint(1, 20)
    request = form.click(name='submitButton', coord=(click_x, click_y))
    request.add_header('Accept', '*/*')
    retrieved = browser.retrieve(request)
    print(' Done.')
    # First element of the retrieve() result is the path of the stored file.
    return open(retrieved[0], 'rb')
def get_date(today: Optional[DTDate] = None) -> DTDate:
    """Return the most recent day whose rates should be requested.

    Saturday and Sunday carry no data, so on Sunday and Monday the preceding
    Friday is used; on every other day the previous day is fine.

    Args:
        today: Reference day. Defaults to the current local date, which is
            read only once so the result cannot straddle midnight.

    Returns:
        The day to request rates for.
    """
    if today is None:
        today = DTDate.today()
    weekday = today.weekday()
    if weekday == 6:
        # Sunday: go back to Friday.
        days_back = 2
    elif weekday == 0:
        # Monday: go back to Friday.
        days_back = 3
    else:
        days_back = 1
    return today - DTTimeDelta(days=days_back)
def mk_filename(date: DTDate, card_type: List[str]) -> str:
    """Build the PDF file name for a day and card type.

    Args:
        date: Day the file belongs to; its ISO form starts the name.
        card_type: ``CARD_MASTERCARD`` or ``CARD_VISA`` (plain lists, to
            avoid a dedicated class just for this).

    Returns:
        ``<iso-date>_MC.pdf`` or ``<iso-date>_VISA.pdf``.

    Raises:
        TypeError: if *card_type* equals neither known card type.
    """
    known_suffixes = (
        (CARD_MASTERCARD, '_MC.pdf'),
        (CARD_VISA, '_VISA.pdf'),
    )
    for known_type, suffix in known_suffixes:
        if card_type == known_type:
            return date.isoformat() + suffix
    raise TypeError("not a valid card type")