first-data-crawler-pdf/utils.py

143 lines
4.2 KiB
Python

#!/usr/bin/env python3
# pylint: disable=missing-docstring,invalid-name
#import argparse
from collections import namedtuple
import datetime
import re
from typing import List, BinaryIO
from datetime import date as DTDate
from datetime import timedelta as DTTimeDelta
import PyPDF3
import mechanize as m
import random
Rate = namedtuple('Rate', ['abbr', 'full_name', 'ask', 'bid'])
# Constants
CARD_MASTERCARD = ['0']
CARD_VISA = ['1']
class CurrencyResult():
def __init__(self):
self.rates = list()
self.card_type = str()
self.date = None
def _select_date():
date = datetime.date.today()-datetime.timedelta(1)
if datetime.date().isoweekday() in [6, 0]:
date = datetime.date.today()
def _parse_rate(text: str) -> float or None:
if re.match('Keine Kursdaten vorhanden', text):
_r = None
else:
text = text.strip(' ').replace(',', '.')
try:
_r = float(text)
except ValueError:
_r = None
return _r
def _parse_card_type(text: str) -> str:
text = text.split(':')[1]
text = text.strip('" ')
return text
def _parse_date(text: str) -> datetime.date:
text = text.split(': ')[1].rstrip()
return datetime.datetime.strptime(text, '%d.%m.%Y').date()
def _array_remove_empty(obj: list) -> List[str]:
try:
while True:
obj.remove('')
except ValueError:
return obj
return obj
def _parse_line(line: str) -> Rate or None:
arr = line.split(" ") # 3 spaces = minimum separation in PDF
arr = _array_remove_empty(arr)
# process currency name
names = arr[0].split(" ", 1)
rate = Rate(
abbr=names[0],
full_name=names[1].strip("()"),
ask=_parse_rate(arr[1]),
bid=_parse_rate(arr[2])
)
return rate
def get_results_from_text(text: str, currency: str = None) -> CurrencyResult:
rates = {}
result = CurrencyResult()
lines = text.splitlines()
# skip intro lines
lines = lines[2:]
# card type
result.card_type = _parse_card_type(lines.pop(0))
# get date
result.date = _parse_date(lines.pop(0))
# skip more lines
lines = lines[4:]
# now the rates begin
if currency is None:
for line in lines:
line_result = _parse_line(line)
rates[line_result.abbr] = line_result
else:
pattern = re.compile("^"+currency)
for line in lines:
if pattern.match(line):
line_result = _parse_line(line)
rates[line_result.abbr] = line_result
result.rates = rates
return result
def get_results_from_pdf(buf: BinaryIO or str, currency: str = None) -> CurrencyResult:
reader = PyPDF3.PdfFileReader(buf)
text = str()
pages = []
for num in range(0, reader.getNumPages()-1):
pages.append(reader.getPage(num))
for page in pages:
text += page.extractText()
return get_results_from_text(text, currency=currency)
def get_fileio(date: DTDate, card_type: List[str] = CARD_VISA) -> BinaryIO: # pylint: disable=dangerous-default-value
print('Downloading rates for ' + date.strftime('%Y-%m-%d') + '... ', end='')
b = m.Browser()
# Firefox 64 User-Agent
# ua = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'
# b.set_header('User-Agent', ua)
# Ignore robots.txt
# b.set_handle_robots(False)
# Debugging flags
# b.set_debug_http(True)
# b.set_debug_redirects(True)
# b.set_debug_responses(True)
# PDF URL
b.open('https://misc.firstdata.eu/CurrencyCalculator/fremdwaehrungskurse/pdf')
fm = b.forms()
fm_i = fm[0]
fm_i.set_all_readonly(False)
# Configure form
fm_i['creditCardsRadio'] = card_type # VISA
fm_i['selectedDatesString'] = str(date.strftime('%Y%m%d') + ',')
# Retrieve file using button click
rq = fm_i.click(name='submitButton', coord=(random.randint(1, 119), random.randint(1, 20)))
rq.add_header('Accept', '*/*')
rp = b.retrieve(rq)
print(' Done.')
return open(rp[0], 'rb')
def get_date() -> DTDate:
if DTDate.today().weekday() in [6, 0]:
date = DTDate.today() + relativedelta(weekday=FR(-1))
else:
date = DTDate.today() - DTTimeDelta(1)
return date