first-data-crawler-pdf/utils.py

143 lines
4.2 KiB
Python
Raw Normal View History

2019-03-27 17:00:04 +01:00
#!/usr/bin/env python3
# pylint: disable=missing-docstring,invalid-name
#import argparse
from collections import namedtuple
import datetime
import re
from typing import List, BinaryIO
from datetime import date as DTDate
from datetime import timedelta as DTTimeDelta
2019-03-27 17:00:04 +01:00
import PyPDF3
import mechanize as m
import random
2019-03-27 17:00:04 +01:00
# One parsed exchange-rate row: currency abbreviation, full currency name,
# and the ask / bid rates (each a float, or None when no rate was parsed).
Rate = namedtuple('Rate', ('abbr', 'full_name', 'ask', 'bid'))

# Constants
# Values assigned to the 'creditCardsRadio' form control in get_fileio().
CARD_MASTERCARD = ['0']
CARD_VISA = ['1']
2019-03-27 17:00:04 +01:00
class CurrencyResult:
    """Plain container for the data parsed out of one rates document."""

    def __init__(self):
        # Starts out as an empty list.
        # NOTE(review): get_results_from_text() replaces this with a dict
        # keyed by currency abbreviation -- confirm which type is intended.
        self.rates = []
        # Card type label; empty until a document has been parsed.
        self.card_type = ''
        # Date of the rates; None until a document has been parsed.
        self.date = None
def _select_date(today: datetime.date = None) -> datetime.date:
    """Return the most recent weekday (Mon-Fri) strictly before *today*.

    The previous implementation could not run: it called ``datetime.date()``
    without arguments (TypeError), compared ``isoweekday()`` (range 1..7)
    against 0, and never returned its result.

    Args:
        today: Reference date; defaults to ``datetime.date.today()``.

    Returns:
        Yesterday, except on Sunday and Monday where the preceding Friday
        is returned.
    """
    if today is None:
        today = datetime.date.today()
    # date.weekday(): Monday == 0 ... Sunday == 6.
    # Sunday -> 2 days back, Monday -> 3 days back, anything else -> 1.
    days_back = {6: 2, 0: 3}.get(today.weekday(), 1)
    return today - datetime.timedelta(days=days_back)
def _parse_rate(text: str) -> float or None:
    """Convert a rate cell with a decimal comma into a float.

    Returns None when the cell starts with the "no rate data available"
    marker or when the remaining text is not a valid number.
    """
    # Marker text the source uses when no rate exists for a currency.
    if re.match('Keine Kursdaten vorhanden', text):
        return None

    # Decimal comma -> decimal point so float() can parse it.
    normalized = text.strip(' ').replace(',', '.')
    try:
        return float(normalized)
    except ValueError:
        return None
def _parse_card_type(text: str) -> str:
    """Extract the value after the first colon of a ``label: value`` line.

    Surrounding spaces and double quotes are removed from the value.
    Raises IndexError when *text* contains no colon.
    """
    value = text.split(':')[1]
    return value.strip('" ')
def _parse_date(text: str) -> datetime.date:
    """Parse the ``DD.MM.YYYY`` date that follows ``': '`` in *text*.

    Raises IndexError when the ``': '`` separator is missing and ValueError
    when the value is not a ``DD.MM.YYYY`` date.
    """
    raw_value = text.split(': ')[1]
    # Trailing whitespace (e.g. a newline) would break strptime.
    parsed = datetime.datetime.strptime(raw_value.rstrip(), '%d.%m.%Y')
    return parsed.date()
def _array_remove_empty(obj: list) -> List[str]:
    """Remove every empty string from *obj* in place and return it.

    The same list object is returned, so callers that rely on either the
    in-place mutation or the return value keep working. A single pass
    replaces the former repeated ``list.remove('')`` calls, each of which
    rescanned the list from the start.
    """
    # Slice assignment keeps the list's identity while replacing its content.
    obj[:] = [item for item in obj if item != '']
    return obj
def _parse_line(line: str) -> Rate or None:
    """Parse one rate line of the extracted PDF text into a Rate.

    A line is expected to hold three columns separated by at least three
    spaces: ``ABBR (Full name)``, the ask rate and the bid rate.

    Splitting on a single space could never work: the first column would
    then never contain a space, so the full name lookup always failed with
    IndexError.

    Args:
        line: One line of rate data.

    Returns:
        The parsed Rate; ``ask`` / ``bid`` are None where _parse_rate could
        not produce a number, and ``full_name`` is '' when the first column
        holds only the abbreviation.

    Raises:
        IndexError: If the line has fewer than three non-empty columns.
    """
    # 3 spaces = minimum separation in PDF. Written as a product so the
    # width stays unambiguous even if whitespace gets collapsed by tooling.
    column_gap = " " * 3
    columns = [column.strip() for column in line.split(column_gap)]
    # Wider gaps produce empty fragments between the real columns.
    columns = [column for column in columns if column]

    # First column: "<abbr> (<full name>)".
    abbr, _, full_name = columns[0].partition(" ")
    return Rate(
        abbr=abbr,
        full_name=full_name.strip().strip("()"),
        ask=_parse_rate(columns[1]),
        bid=_parse_rate(columns[2]),
    )
def get_results_from_text(text: str, currency: str = None) -> CurrencyResult:
    """Build a CurrencyResult from the text extracted from a rates document.

    Line layout consumed here: two intro lines (skipped), the card type
    line, the date line, four more lines (skipped), then one rate per line.

    Args:
        text: Full extracted text.
        currency: Optional filter. It is compiled as the regular expression
            ``"^" + currency`` and only rate lines matching it are parsed,
            so regex metacharacters in it are interpreted, not escaped.

    Returns:
        A CurrencyResult whose ``rates`` is a dict mapping each currency
        abbreviation to its Rate.

    Raises:
        IndexError: If the text has too few header lines (or a header or
            rate line cannot be split as expected by the helpers).
    """
    result = CurrencyResult()
    all_lines = text.splitlines()

    # Lines 0-1 are intro; line 2 is the card type, line 3 the date.
    result.card_type = _parse_card_type(all_lines[2])
    result.date = _parse_date(all_lines[3])

    # Lines 4-7 are skipped as well; the rates begin at line 8.
    rate_lines = all_lines[8:]

    if currency is None:
        selected = rate_lines
    else:
        matcher = re.compile("^" + currency)
        selected = (entry for entry in rate_lines if matcher.match(entry))

    result.rates = {rate.abbr: rate for rate in map(_parse_line, selected)}
    return result
def get_results_from_pdf(buf: BinaryIO or str, currency: str = None) -> CurrencyResult:
    """Extract the text of a rates PDF and parse it into a CurrencyResult.

    Args:
        buf: Whatever PyPDF3.PdfFileReader accepts as its source.
        currency: Optional filter forwarded to get_results_from_text().

    Returns:
        The CurrencyResult produced by get_results_from_text().
    """
    reader = PyPDF3.PdfFileReader(buf)

    # NOTE(review): the "- 1" means the LAST page is never read (a one-page
    # PDF yields no text at all). This looks like an off-by-one, but it may
    # be deliberate -- confirm against a real document before changing it.
    page_count = reader.getNumPages() - 1
    pages = [reader.getPage(index) for index in range(page_count)]

    text = "".join(page.extractText() for page in pages)
    return get_results_from_text(text, currency=currency)
def get_fileio(date: DTDate, card_type: List[str] = CARD_VISA) -> BinaryIO: # pylint: disable=dangerous-default-value
    """Download the rates PDF for *date* and return it as an open file.

    The download form is fetched, filled in and submitted through a
    mechanize browser; progress is printed to stdout.

    Args:
        date: Day whose rates are requested.
        card_type: Value for the form's 'creditCardsRadio' control
            (CARD_VISA by default). The default list is shared between
            calls but is never mutated here.

    Returns:
        The downloaded file opened in binary read mode. The caller is
        responsible for closing it.
    """
    print('Downloading rates for {}... '.format(date.strftime('%Y-%m-%d')), end='')
    browser = m.Browser()

    # Optional knobs, disabled by default:
    #   Firefox 64 User-Agent:
    #     ua = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'
    #     browser.set_header('User-Agent', ua)
    #   Ignore robots.txt:
    #     browser.set_handle_robots(False)
    #   Debugging flags:
    #     browser.set_debug_http(True)
    #     browser.set_debug_redirects(True)
    #     browser.set_debug_responses(True)

    # PDF URL
    browser.open('https://misc.firstdata.eu/CurrencyCalculator/fremdwaehrungskurse/pdf')
    form = browser.forms()[0]
    # Make every control writable so the fields below can be assigned.
    form.set_all_readonly(False)

    # Configure form
    form['creditCardsRadio'] = card_type
    form['selectedDatesString'] = date.strftime('%Y%m%d') + ','

    # Retrieve file using button click; the click position is randomised
    # (x first, then y).
    click_x = random.randint(1, 119)
    click_y = random.randint(1, 20)
    request = form.click(name='submitButton', coord=(click_x, click_y))
    request.add_header('Accept', '*/*')

    # retrieve() returns a tuple whose first item is the local file name.
    local_path = browser.retrieve(request)[0]
    print(' Done.')
    return open(local_path, 'rb')
def get_date(today: DTDate = None) -> DTDate:
    """Return the most recent weekday (Mon-Fri) strictly before *today*.

    Implemented with plain ``timedelta`` arithmetic: the previous version
    used ``relativedelta`` and ``FR``, which this module never imports, so
    it raised NameError on Sundays and Mondays.

    Args:
        today: Reference date; defaults to the current local date.

    Returns:
        The preceding Friday when *today* is a Sunday or Monday, otherwise
        the day before *today*.
    """
    if today is None:
        today = DTDate.today()

    # date.weekday(): Monday == 0 ... Sunday == 6.
    weekday = today.weekday()
    if weekday == 6:
        # Sunday: Friday was two days ago.
        return today - DTTimeDelta(days=2)
    if weekday == 0:
        # Monday: Friday was three days ago.
        return today - DTTimeDelta(days=3)
    return today - DTTimeDelta(days=1)