first-data-crawler-pdf/utils.py

101 lines
2.7 KiB
Python
Raw Normal View History

2019-03-27 17:00:04 +01:00
#!/usr/bin/env python3
# pylint: disable=missing-docstring,invalid-name
#import argparse
from collections import namedtuple
import datetime
import re
from typing import List, BinaryIO
import PyPDF3
# One exchange-rate row: currency abbreviation, full currency name and the
# parsed ask / bid values (each a float, or None when no value was parsed).
Rate = namedtuple('Rate', 'abbr full_name ask bid')
class CurrencyResult():
    """Container for the data parsed from one exchange-rate document."""

    def __init__(self):
        # Starts out as an empty list; get_results_from_text() later replaces
        # it with a dict mapping currency abbreviation -> Rate.
        self.rates = []
        # Card type string taken from the document header.
        self.card_type = ''
        # Date of the rates; None until a document has been parsed.
        self.date = None
def _select_date():
date = datetime.date.today()-datetime.timedelta(1)
if datetime.date().isoweekday() in [6, 0]:
date = datetime.date.today()
def _parse_rate(text: str) -> float or None:
if re.match('Keine Kursdaten vorhanden', text):
_r = None
else:
text = text.strip(' ').replace(',', '.')
try:
_r = float(text)
except ValueError:
_r = None
return _r
def _parse_card_type(text: str) -> str:
text = text.split(':')[1]
text = text.strip('" ')
return text
def _parse_date(text: str) -> datetime.date:
text = text.split(': ')[1].rstrip()
return datetime.datetime.strptime(text, '%d.%m.%Y').date()
def _array_remove_empty(obj: list) -> List[str]:
try:
while True:
obj.remove('')
except ValueError:
return obj
return obj
def _parse_line(line: str) -> Rate:
    """Turn one rate line of the document text into a ``Rate``.

    The line is split into columns on runs of three spaces. The first
    column holds ``ABBR (Full name)``; the second and third columns are
    parsed as the ask and bid values.

    Raises:
        IndexError: if the line has fewer than three columns, or the first
            column contains no space between abbreviation and name.
    """
    # Three spaces are the minimum column separation in the PDF text.
    # Built with ``* 3`` so the width cannot be silently collapsed by
    # whitespace-normalising tools.
    separator = " " * 3
    # Wider gaps produce empty fragments between separators; drop them.
    columns = [column for column in line.split(separator) if column != ""]
    # First column: abbreviation, one space, then the name in parentheses.
    names = columns[0].split(" ", 1)
    return Rate(
        abbr=names[0],
        full_name=names[1].strip("()"),
        ask=_parse_rate(columns[1]),
        bid=_parse_rate(columns[2]),
    )
def get_results_from_text(text: str, currency: str = None) -> CurrencyResult:
    """Parse the extracted document text into a ``CurrencyResult``.

    Expected line layout: two intro lines, the card-type line, the date
    line, four further lines that are skipped, then one line per rate.

    Args:
        text: Full text of the document.
        currency: Optional filter. It is compiled as a regular expression
            anchored at the start of the line; only matching rate lines
            are parsed. ``None`` parses every rate line.

    Returns:
        A ``CurrencyResult`` whose ``rates`` is a dict mapping currency
        abbreviation to ``Rate``.

    Raises:
        IndexError: if the text has fewer header lines than expected or a
            non-blank rate line lacks columns.
    """
    result = CurrencyResult()
    lines = text.splitlines()
    # Header: lines 0-1 are intro, line 2 is the card type, line 3 the date.
    result.card_type = _parse_card_type(lines[2])
    result.date = _parse_date(lines[3])
    # Lines 4-7 are skipped as well; the rates start at line 8.
    rate_lines = lines[8:]

    pattern = None if currency is None else re.compile("^" + currency)
    rates = {}
    for line in rate_lines:
        # Blank lines carry no columns and would make _parse_line fail.
        if not line.strip():
            continue
        if pattern is not None and not pattern.match(line):
            continue
        line_result = _parse_line(line)
        rates[line_result.abbr] = line_result
    result.rates = rates
    return result
def get_results_from_pdf(buf: BinaryIO or str, currency: str = None) -> CurrencyResult:
    """Extract the text of a PDF and parse it into a ``CurrencyResult``.

    Args:
        buf: Passed unchanged to ``PyPDF3.PdfFileReader``.
        currency: Forwarded to ``get_results_from_text`` as its filter.

    Returns:
        The ``CurrencyResult`` produced by ``get_results_from_text``.
    """
    reader = PyPDF3.PdfFileReader(buf)
    # Page indices run from 0 to getNumPages() - 1, and range() already
    # excludes its upper bound. The former ``getNumPages()-1`` bound dropped
    # the last page and yielded empty text for a one-page document.
    text = "".join(
        reader.getPage(num).extractText()
        for num in range(reader.getNumPages())
    )
    return get_results_from_text(text, currency=currency)