refactor extraction date from string

This commit is contained in:
lelo 2025-06-13 21:11:22 +00:00
parent 04e6d80af4
commit 31ebdfe9dd

View File

@@ -1,54 +1,71 @@
from flask import session from flask import session
from datetime import datetime
import re import re
import os import os
import sqlite3 import sqlite3
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import Optional
# Module-wide SQLite connection to the access log database.
# check_same_thread=False lifts sqlite3's same-thread restriction on the connection.
log_db = sqlite3.connect("access_log.db", check_same_thread=False)
# Precompiled regex for date-like substrings: ISO dashed or dotted X.Y.Z.
_DATE_REGEX = re.compile(
    r"("                            # start group
    r"\d{4}-\d{1,2}-\d{1,2}"        # ISO: YYYY-M-D or YYYY-MM-DD
    r"|"                            # or
    r"\d{1,4}\.\d{1,2}\.\d{1,4}"    # dotted: outer groups 1-4 digits, middle 1-2
    r")"
)


def _try_parse(date_str: str, fmt: str) -> Optional[datetime]:
    """Parse date_str with the strptime format fmt; return None if it does not fit."""
    try:
        return datetime.strptime(date_str, fmt)
    except ValueError:
        return None


def _dotted_formats(date_str: str) -> list:
    """Return the strptime formats worth trying for a dotted X.Y.Z string, in priority order."""
    parts = date_str.split('.')
    formats = []
    # Unambiguous: last group has 4 digits -> DD.MM.YYYY
    if len(parts[-1]) == 4:
        formats.append('%d.%m.%Y')
    # Unambiguous: first group has 4 digits -> YYYY.MM.DD
    if len(parts[0]) == 4:
        formats.append('%Y.%m.%d')
    # Ambiguous two-digit groups: prefer DD.MM.YY, fall back to YY.MM.DD
    if all(len(part) == 2 for part in parts):
        formats.extend(['%d.%m.%y', '%y.%m.%d'])
    return formats


def extract_date_from_string(text: str) -> Optional[str]:
    """
    Extract the first valid date found in text and return it in ISO format (YYYY-MM-DD).

    Supports:
    - ISO-style dates with dashes (YYYY-M-D or YYYY-MM-DD)
    - Dotted dates (DD.MM.YYYY, YYYY.MM.DD, DD.MM.YY, YY.MM.DD)

    Date-like substrings that do not parse as a real calendar date (for
    example a version number such as "1.2.3", or "31.02.2024") are skipped
    and the scan continues with the next candidate.

    Returns None when text is empty or contains no parsable date.
    """
    if not text:
        return None

    for match in _DATE_REGEX.finditer(text):
        date_str = match.group(1)

        # Dashed candidates have exactly one possible layout; dotted ones may have several.
        if '-' in date_str:
            formats = ['%Y-%m-%d']
        else:
            formats = _dotted_formats(date_str)

        for fmt in formats:
            parsed = _try_parse(date_str, fmt)
            if parsed is not None:
                return parsed.strftime('%Y-%m-%d')

    return None