first-data-crawler-pdf/utils.py

172 lines
5.3 KiB
Python
Raw Normal View History

2019-03-27 17:00:04 +01:00
#!/usr/bin/env python3
# pylint: disable=missing-docstring,invalid-name
2019-05-04 21:54:34 +02:00
import random
2019-03-27 17:00:04 +01:00
import re
2019-05-04 21:54:34 +02:00
from collections import namedtuple
from datetime import date as DTDate
from datetime import timedelta as DTTimeDelta
2019-05-04 21:54:34 +02:00
from datetime import datetime as DTDateTime
2020-07-18 20:22:33 +02:00
from typing import BinaryIO, List, Dict
from collections import OrderedDict
from sys import stderr
import mechanize as m
2019-05-04 21:54:34 +02:00
import PyPDF3
from dateutil.relativedelta import FR, relativedelta
2019-03-27 17:00:04 +01:00
2020-07-18 20:22:33 +02:00
# One row of the rate table: currency abbreviation, full currency name,
# ask and bid rates (None when a rate could not be parsed) and the date
# taken from the parsing context.
Rate = namedtuple('Rate', ['abbr', 'full_name', 'ask', 'bid', 'date'])
2019-03-27 17:00:04 +01:00
# Constants
# Card type selectors: get_fileio assigns one of these lists to the
# 'creditCardsRadio' form field, and mk_filename compares against them to
# pick the file name suffix.
CARD_MASTERCARD = ['0']
CARD_VISA = ['1']
2019-06-21 22:16:16 +02:00
class CurrencyResult:
    """Container for everything parsed from one exchange-rate document.

    Attributes:
        rates: mapping of currency abbreviation to its ``Rate``.
        card_type: card type label read from the document header.
        date: ``datetime.date`` the rates apply to, or None until parsed.
    """

    def __init__(self):
        # Start with a real, empty mapping so a fresh instance can be
        # iterated and indexed; the typing alias belongs in the annotation,
        # not in the value.
        self.rates = {}  # type: Dict[str, Rate]
        self.card_type = str()
        self.date = None
2019-06-21 22:16:16 +02:00
2019-03-27 17:00:04 +01:00
def _parse_rate(text: str) -> float or None:
    """Convert one rate cell of the table into a float.

    Returns None when the cell starts with the "no rate data" marker or
    when the remaining text is not a valid number.
    """
    # NOTE: the annotation ``float or None`` evaluates to plain ``float``;
    # callers must still expect None.
    if re.match('Keine Kursdaten vorhanden', text):
        return None
    # Trim surrounding spaces and turn the decimal comma into a decimal
    # point so that float() accepts the value.
    normalized = text.strip(' ').replace(',', '.')
    try:
        return float(normalized)
    except ValueError:
        return None
2019-06-21 22:16:16 +02:00
2019-03-27 17:00:04 +01:00
def _parse_card_type(text: str) -> str:
    """Extract the card type from a ``label: value`` header line.

    Used for validating metadata from the PDF against the request data.
    The text between the first and the second colon is returned with
    surrounding double quotes and spaces removed.

    Raises:
        IndexError: if the line contains no colon.
    """
    fields = text.split(':')
    return fields[1].strip('" ')
2019-06-21 22:16:16 +02:00
2019-05-04 21:54:34 +02:00
def _parse_date(text: str) -> DTDate:
    """Extract the date from a ``label: DD.MM.YYYY`` header line.

    Used for validating metadata from the PDF against the request data.

    Raises:
        IndexError: if the line does not contain ``': '``.
        ValueError: if the value is not a ``DD.MM.YYYY`` date.
    """
    date_text = text.split(': ')[1]
    # Only trailing whitespace (e.g. a line break) is dropped before parsing.
    date_text = date_text.rstrip()
    parsed = DTDateTime.strptime(date_text, '%d.%m.%Y')
    return parsed.date()
2019-03-27 17:00:04 +01:00
2019-06-21 22:16:16 +02:00
2019-03-27 17:00:04 +01:00
def _array_remove_empty(obj: list) -> List[str]:
    """Remove every empty-string element from ``obj`` in place.

    Only elements equal to ``''`` are dropped; other values are kept in
    their original order.

    Args:
        obj: list to clean up; it is modified in place.

    Returns:
        The same list object that was passed in.
    """
    # Slice assignment keeps the caller's list object while filtering in a
    # single pass instead of calling list.remove() once per empty element.
    obj[:] = [item for item in obj if item != '']
    return obj
2019-06-21 22:16:16 +02:00
2020-07-18 20:22:33 +02:00
def _parse_line(line: str, ctx: CurrencyResult) -> Rate:
    """Parse one row of the rate table into a ``Rate``.

    The row consists of columns separated by at least three spaces: the
    currency (``ABBR (Full name)``), the ask rate and the bid rate.

    Args:
        line: one text line of the rate table.
        ctx: result being built; its ``date`` is copied into the rate.

    Returns:
        The parsed ``Rate``; ask/bid are None when not parseable.

    Raises:
        IndexError: if the line has fewer than three columns or the
            currency column has no full name after the abbreviation.
    """
    # Columns are separated by a run of at least three spaces; the currency
    # column itself contains single spaces, so the separator must be exactly
    # three spaces wide.  Longer runs yield empty fields, which are dropped.
    columns = _array_remove_empty(line.split("   "))
    # process currency name: "ABBR (Full name)"
    names = columns[0].split(" ", 1)
    return Rate(
        abbr=names[0],
        full_name=names[1].strip("()"),
        ask=_parse_rate(columns[1]),
        bid=_parse_rate(columns[2]),
        date=ctx.date,
    )
def get_results_from_text(text: str, currency: str = None) -> CurrencyResult:
    """Parse the text extracted from a rate document into a CurrencyResult.

    Line layout of ``text``: two intro lines, the card type line, the date
    line, four more skipped lines, then one rate per line.

    Args:
        text: full text of the document.
        currency: when given, only lines matching it at the start of the
            line are parsed.  The value is used as a regular expression.

    Returns:
        A CurrencyResult with ``card_type``, ``date`` and ``rates`` filled;
        ``rates`` maps abbreviation to Rate in document order.

    Raises:
        IndexError: if the header is shorter than expected or a rate line
            is malformed.
    """
    result = CurrencyResult()
    lines = text.splitlines()
    # Lines 0-1 are intro text; card type and date follow.
    result.card_type = _parse_card_type(lines[2])
    result.date = _parse_date(lines[3])
    pattern = None if currency is None else re.compile("^" + currency)
    rates = OrderedDict()
    # Lines 4-7 are skipped; the rates begin at line 8.
    for line in lines[8:]:
        # Blank lines carry no rate and would make _parse_line fail.
        if not line.strip():
            continue
        if pattern is not None and not pattern.match(line):
            continue
        line_result = _parse_line(line, result)
        rates[line_result.abbr] = line_result
    result.rates = rates
    return result
2019-06-21 22:16:16 +02:00
2019-03-27 17:00:04 +01:00
def get_results_from_pdf(buf: BinaryIO or str, currency: str = None) -> CurrencyResult:
    """Extract the text of a rate PDF and parse it.

    Args:
        buf: value handed to ``PyPDF3.PdfFileReader``.
        currency: forwarded unchanged to ``get_results_from_text``.

    Returns:
        The CurrencyResult produced by ``get_results_from_text``.
    """
    print('Parsing data... ', end='', file=stderr)
    reader = PyPDF3.PdfFileReader(buf)
    # NOTE(review): the range stops at getNumPages()-1, so the last page is
    # never read -- confirm that this is intentional.
    page_indices = range(0, reader.getNumPages() - 1)
    text = ''.join(reader.getPage(idx).extractText() for idx in page_indices)
    print('Done.', file=stderr)
    return get_results_from_text(text, currency=currency)
2019-06-21 22:16:16 +02:00
def get_fileio(date: DTDate, card_type: List[str] = CARD_VISA) -> BinaryIO:  # pylint: disable=dangerous-default-value
    """Download the rate PDF for ``date`` and return it as an open file.

    Args:
        date: day whose rates are requested.
        card_type: value for the card type form field (CARD_VISA or
            CARD_MASTERCARD).  The default list is only read, never
            modified, by this function.

    Returns:
        A file object opened in binary read mode on the downloaded PDF.
        The caller is responsible for closing it.
    """
    # pylint: disable=no-member  # mechanize.Browser has some lazy-loading methods that pylint doesn't see
    print('Downloading rates for ' + date.strftime('%Y-%m-%d') + '... ', end='', file=stderr)
    b = m.Browser()
    # Firefox 64 User-Agent
    # ua = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'
    # b.set_header('User-Agent', ua)
    # Ignore robots.txt
    # b.set_handle_robots(False)
    # Debugging flags
    # b.set_debug_http(True)
    # b.set_debug_redirects(True)
    # b.set_debug_responses(True)
    # PDF URL
    b.open('https://misc.firstdata.eu/CurrencyCalculator/fremdwaehrungskurse/pdf')
    fm = b.forms()[0]
    # This must be done because the options can't be changed otherwise
    fm.set_all_readonly(False)
    # Configure form
    fm['creditCardsRadio'] = card_type
    fm['selectedDatesString'] = date.strftime('%Y%m%d') + ','
    # Retrieve file using button click; the button is 115x21 pixels in size.
    # The API apparently doesn't like the max values
    rq = fm.click(name='submitButton', coord=(random.randint(1, 114), random.randint(1, 20)))
    rq.add_header('Accept', '*/*')
    rp = b.retrieve(rq)
    # Finish the progress line on stderr, where it was started, so stdout
    # stays free of status messages.
    print(' Done.', file=stderr)
    # Returns an open file-like object with the PDF as contents
    return open(rp[0], 'rb')
2019-06-21 22:16:16 +02:00
def get_date(today: DTDate = None) -> DTDate:
    """Return the most recent day whose rates should be requested.

    Args:
        today: reference day; defaults to the current local date.  It is
            read only once, so the result cannot mix two different days.

    Returns:
        The preceding Friday when ``today`` is a Sunday or Monday,
        otherwise the day before ``today``.
    """
    if today is None:
        today = DTDate.today()
    weekday = today.weekday()
    # For Sunday and Monday, use Friday's data (per the original note,
    # Saturday and Sunday carry no data).
    if weekday == 6:
        days_back = 2
    elif weekday == 0:
        days_back = 3
    else:
        # For all other days, the previous day is fine
        days_back = 1
    return today - DTTimeDelta(days=days_back)
2019-06-21 22:16:16 +02:00
def mk_filename(date: DTDate, card_type: List[str]) -> str:
    """Build the PDF file name for a date and card type.

    The name is the ISO date followed by ``_MC.pdf`` for CARD_MASTERCARD
    or ``_VISA.pdf`` for CARD_VISA.

    Raises:
        TypeError: if ``card_type`` equals neither known card type.
    """
    # List[str] is used for card types to avoid a dedicated class, so the
    # lookup is a sequence of equality checks rather than a dict.
    known_suffixes = (
        (CARD_MASTERCARD, '_MC.pdf'),
        (CARD_VISA, '_VISA.pdf'),
    )
    for known_type, suffix in known_suffixes:
        if card_type == known_type:
            return date.isoformat() + suffix
    raise TypeError("not a valid card type")
2020-07-18 20:22:33 +02:00