Refactor date extraction from string

This commit is contained in:
lelo 2025-06-13 21:11:22 +00:00
parent 04e6d80af4
commit 31ebdfe9dd

View File

@ -1,57 +1,74 @@
from flask import session from flask import session
from datetime import datetime
import re import re
import os import os
import sqlite3 import sqlite3
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import Optional
log_db = sqlite3.connect("access_log.db", check_same_thread=False) log_db = sqlite3.connect("access_log.db", check_same_thread=False)
def extract_date_from_string(string_with_date):
# grab X.Y.Z where X and Z are 1-4 digits and Y is 1-2 digits
m = re.search(r'(\d{1,4}\.\d{1,2}\.\d{1,4})', string_with_date)
if not m:
return None
date_str = m.group(1)
parts = date_str.split('.')
# 1) Unambiguous “last group = YYYY”
if len(parts) == 3 and len(parts[2]) == 4:
fmt = '%d.%m.%Y'
# 2) Unambiguous “first group = YYYY” # Precompiled regex to find date-like patterns: either dotted X.Y.Z or ISO dashed YYYY-MM-DD
elif len(parts) == 3 and len(parts[0]) == 4: _DATE_REGEX = re.compile(
fmt = '%Y.%m.%d' r"(" # start group
r"\d{4}-\d{1,2}-\d{1,2}" # ISO: YYYY-M-D or YYYY-MM-DD
r"|" # or
r"\d{1,4}\.\d{1,2}\.\d{1,4}" # dotted: X.Y.Z, where X and Z are 1-4 digits and Y is 1-2 digits
r")"
)
# 3) Ambiguous “XX.XX.XX” → prefer DD.MM.YY, fallback to YY.MM.DD
elif len(parts) == 3 and all(len(p) == 2 for p in parts):
# try last-group-as-year first
try:
dt = datetime.strptime(date_str, '%d.%m.%y')
return dt.strftime('%Y-%m-%d')
except ValueError:
# fallback to first-group-as-year
fmt = '%y.%m.%d'
else: def _try_parse(date_str: str, fmt: str) -> Optional[datetime]:
# optional: handle ISO with dashes """Try to parse date_str with fmt, return datetime or None."""
if '-' in date_str:
try:
dt = datetime.strptime(date_str, '%Y-%m-%d')
return dt.strftime('%Y-%m-%d')
except ValueError:
return None
return None
# parse with whichever fmt we settled on
try: try:
dt = datetime.strptime(date_str, fmt) return datetime.strptime(date_str, fmt)
return dt.strftime('%Y-%m-%d')
except ValueError: except ValueError:
return None return None
def extract_date_from_string(text: str) -> Optional[str]:
    """
    Extract a date from text and return it in ISO format (YYYY-MM-DD).

    Date-like substrings are examined from left to right and the first one
    that parses as a real calendar date is returned. A date-like substring
    that does not parse (for example a version number such as ``1.2.3``)
    is skipped instead of ending the search.

    Supports:
    - ISO-style dates with dashes (YYYY-M-D or YYYY-MM-DD)
    - Dotted dates (DD.MM.YYYY, YYYY.MM.DD, DD.MM.YY, YY.MM.DD)

    Args:
        text: The string to search.

    Returns:
        The date formatted as 'YYYY-MM-DD', or None when no date-like
        substring can be parsed.
    """
    for match in _DATE_REGEX.finditer(text):
        date_str = match.group(1)

        if '-' in date_str:
            # ISO dashed format has exactly one possible layout.
            candidates = ['%Y-%m-%d']
        else:
            parts = date_str.split('.')
            if len(parts) != 3:
                continue
            candidates = []
            # Unambiguous: last part has 4 digits -> DD.MM.YYYY
            if len(parts[2]) == 4:
                candidates.append('%d.%m.%Y')
            # Unambiguous: first part has 4 digits -> YYYY.MM.DD
            if len(parts[0]) == 4:
                candidates.append('%Y.%m.%d')
            # Ambiguous two-digit groups: prefer DD.MM.YY, then YY.MM.DD
            if all(len(part) == 2 for part in parts):
                candidates.extend(['%d.%m.%y', '%y.%m.%d'])

        for fmt in candidates:
            parsed = _try_parse(date_str, fmt)
            if parsed is not None:
                # isoformat() always zero-pads the year to four digits;
                # strftime('%Y') padding for years below 1000 can vary
                # by platform.
                return parsed.date().isoformat()

    # No date-like substring produced a valid date.
    return None
def extract_structure_from_string(input_string): def extract_structure_from_string(input_string):
# extract category and titel from filename # extract category and titel from filename
filepathname_ext = os.path.splitext(input_string)[0] # remove file extension filepathname_ext = os.path.splitext(input_string)[0] # remove file extension