Refactor date extraction from string
This commit is contained in:
parent
04e6d80af4
commit
31ebdfe9dd
@ -1,57 +1,74 @@
|
|||||||
from flask import session
|
from flask import session
|
||||||
from datetime import datetime
|
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
log_db = sqlite3.connect("access_log.db", check_same_thread=False)
|
log_db = sqlite3.connect("access_log.db", check_same_thread=False)
|
||||||
|
|
||||||
# Precompiled regex to find date-like patterns: either ISO dashed YYYY-MM-DD
# or dotted X.Y.Z. The pattern only selects candidates; whether a candidate is
# a real calendar date is decided later by datetime.strptime.
_DATE_REGEX = re.compile(
    r"("                            # start group
    r"\d{4}-\d{1,2}-\d{1,2}"        # ISO: YYYY-M-D or YYYY-MM-DD
    r"|"                            # or
    r"\d{1,4}\.\d{1,2}\.\d{1,4}"    # dotted: X.Y.Z, outer groups 1-4 digits
    r")"
)


def _try_parse(date_str: str, fmt: str) -> Optional[datetime]:
    """Try to parse date_str with fmt, return datetime or None."""
    try:
        return datetime.strptime(date_str, fmt)
    except ValueError:
        return None


def _candidate_formats(date_str: str) -> list:
    """Return the strptime formats worth trying for one regex match, in priority order."""
    # Only the ISO alternative of _DATE_REGEX can produce a dash.
    if '-' in date_str:
        return ['%Y-%m-%d']

    parts = date_str.split('.')
    if len(parts) != 3:
        return []

    formats = []
    # Unambiguous: last part has 4 digits -> DD.MM.YYYY
    if len(parts[2]) == 4:
        formats.append('%d.%m.%Y')
    # Unambiguous: first part has 4 digits -> YYYY.MM.DD
    if len(parts[0]) == 4:
        formats.append('%Y.%m.%d')
    # Ambiguous two-digit groups: prefer DD.MM.YY, fall back to YY.MM.DD
    if all(len(p) == 2 for p in parts):
        formats.extend(['%d.%m.%y', '%y.%m.%d'])
    return formats


def extract_date_from_string(text: str) -> Optional[str]:
    """
    Extract the first valid date from text and return it in ISO format (YYYY-MM-DD).

    Every date-like substring is examined from left to right; the first one
    that parses as a real calendar date wins. A date-like token that is not a
    date (for example a version number such as "1.2.3", or "2023-13-45")
    therefore does not hide a valid date that appears later in the text.

    Supports:
    - ISO-style dates with dashes (YYYY-M-D or YYYY-MM-DD)
    - Dotted dates (DD.MM.YYYY, YYYY.MM.DD, DD.MM.YY, YY.MM.DD)

    Args:
        text: String to search, e.g. a file name or title.

    Returns:
        The date as a 'YYYY-MM-DD' string, or None if text is empty or
        contains no parseable date.
    """
    if not text:
        return None

    for match in _DATE_REGEX.finditer(text):
        date_str = match.group(1)
        for fmt in _candidate_formats(date_str):
            dt = _try_parse(date_str, fmt)
            if dt is not None:
                # isoformat() always zero-pads the year, unlike strftime('%Y')
                # on some platforms for years below 1000.
                return dt.date().isoformat()

    # no valid parse
    return None
|
||||||
|
|
||||||
|
|
||||||
def extract_structure_from_string(input_string):
|
def extract_structure_from_string(input_string):
|
||||||
# extract category and title from filename
|
# extract category and title from filename
|
||||||
filepathname_ext = os.path.splitext(input_string)[0] # remove file extension
|
filepathname_ext = os.path.splitext(input_string)[0] # remove file extension
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user