# Precompiled pattern for date-like substrings: ISO dashed (YYYY-M-D) or
# dotted (X.Y.Z). The digit look-arounds keep a match from starting or ending
# inside a longer run of digits, so "2023-01-023" is not silently truncated
# to "2023-01-02" and "12345-01-02" is not read as the year 2345.
_DATE_REGEX = re.compile(
    r"(?<!\d)"                     # not preceded by another digit
    r"("                           # start group
    r"\d{4}-\d{1,2}-\d{1,2}"       # ISO: YYYY-M-D or YYYY-MM-DD
    r"|"                           # or
    r"\d{1,4}\.\d{1,2}\.\d{1,4}"   # dotted: X.Y.Z, outer groups 1-4 digits
    r")"
    r"(?!\d)"                      # not followed by another digit
)


def _try_parse(date_str: str, fmt: str) -> Optional[datetime]:
    """Parse date_str with the strptime format fmt; return None on mismatch."""
    try:
        return datetime.strptime(date_str, fmt)
    except ValueError:
        return None


def _candidate_formats(date_str: str) -> tuple:
    """Return the strptime formats worth trying for one regex hit, in order."""
    # ISO dashed dates have exactly one possible layout.
    if '-' in date_str:
        return ('%Y-%m-%d',)

    parts = date_str.split('.')
    if len(parts) != 3:
        return ()

    formats = []
    # Unambiguous: last part has 4 digits -> DD.MM.YYYY
    if len(parts[2]) == 4:
        formats.append('%d.%m.%Y')
    # Unambiguous: first part has 4 digits -> YYYY.MM.DD
    if len(parts[0]) == 4:
        formats.append('%Y.%m.%d')
    # Ambiguous two-digit groups: prefer DD.MM.YY, fall back to YY.MM.DD
    if all(len(p) == 2 for p in parts):
        formats.extend(['%d.%m.%y', '%y.%m.%d'])
    return tuple(formats)


def extract_date_from_string(text: str) -> Optional[str]:
    """
    Extract the first parseable date from text and return it as YYYY-MM-DD.

    Supports:
      - ISO-style dates with dashes (YYYY-M-D or YYYY-MM-DD)
      - Dotted dates (DD.MM.YYYY, YYYY.MM.DD, DD.MM.YY, YY.MM.DD)

    Every date-like substring is examined in order of appearance, so a
    number group that is not a real date (a version such as "1.2.3" or an
    impossible date such as "31.02.2023") does not hide a valid date that
    follows it.

    Args:
        text: Arbitrary string that may contain a date. Falsy values
            (None, "") yield None.

    Returns:
        The date in ISO format, or None if no substring parses as a date.
    """
    if not text:
        return None

    for match in _DATE_REGEX.finditer(text):
        date_str = match.group(1)
        for fmt in _candidate_formats(date_str):
            dt = _try_parse(date_str, fmt)
            if dt is not None:
                # isoformat() always zero-pads the year to four digits,
                # which strftime('%Y') does not guarantee on all platforms.
                return dt.date().isoformat()

    # no valid parse
    return None