advanced date extraction

This commit is contained in:
lelo 2025-05-11 12:50:05 +02:00
parent 688625f451
commit 923693ed9a

View File

@ -69,6 +69,52 @@ def get_hit_count(relative_path):
row = cursor.fetchone()
return row["hit_count"] if row else 0
def extract_date_from_string(string_with_date):
    """Find the first date in a string and return it as ISO ``YYYY-MM-DD``.

    Dotted dates are tried first, using the first ``X.Y.Z`` group where X and
    Z have 1-4 digits and Y has 1-2 digits:

    * last group has 4 digits        -> ``DD.MM.YYYY``
    * first group has 4 digits       -> ``YYYY.MM.DD``
    * first and last have 1-2 digits -> ``DD.MM.YY`` preferred, with
      ``YY.MM.DD`` as fallback (a two-digit year is always required)

    If no dotted date parses, the first ISO date written with dashes
    (``YYYY-MM-DD``) is used instead.

    Args:
        string_with_date: Text (for example a relative file path) that may
            contain a date. ``None`` and the empty string are accepted.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``None`` when no candidate
        is found or none of the candidates is a valid calendar date.
    """
    if not string_with_date:
        return None

    # Each entry is (matched text, strptime formats to try in order).
    candidates = []

    # Grab X.Y.Z where X and Z are 1-4 digits and Y is 1-2 digits.
    dotted = re.search(r'(\d{1,4}\.\d{1,2}\.\d{1,4})', string_with_date)
    if dotted:
        date_str = dotted.group(1)
        # The regex guarantees exactly three dot-separated groups.
        first, _, last = date_str.split('.')
        if len(last) == 4:
            # Unambiguous: last group is the four-digit year.
            candidates.append((date_str, ('%d.%m.%Y',)))
        elif len(first) == 4:
            # Unambiguous: first group is the four-digit year.
            candidates.append((date_str, ('%Y.%m.%d',)))
        elif len(first) <= 2 and len(last) <= 2:
            # Ambiguous short form: prefer day-first, fall back to
            # year-first. %y only accepts two digits, so inputs such as
            # "1.2.3" are still rejected by both formats.
            candidates.append((date_str, ('%d.%m.%y', '%y.%m.%d')))

    # ISO dates with dashes need their own pattern because the dotted regex
    # can never match them; the lookarounds avoid cutting into longer digit
    # runs.
    iso = re.search(r'(?<!\d)(\d{4}-\d{1,2}-\d{1,2})(?!\d)', string_with_date)
    if iso:
        candidates.append((iso.group(1), ('%Y-%m-%d',)))

    for text, formats in candidates:
        for fmt in formats:
            try:
                return datetime.strptime(text, fmt).strftime('%Y-%m-%d')
            except ValueError:
                # Not a valid calendar date in this layout; try the next one.
                continue
    return None
def updatefileindex():
cursor = search_db.cursor()
@ -169,21 +215,7 @@ def updatefileindex():
titel = None
name = None
# extract the date from path using regex (supports YYYY.MM.DD, DD.MM.YYYY or DD.MM.YY)
date_match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{2,4}|\d{4}\.\d{2}\.\d{2})', relative_path)
if date_match:
date_str = date_match.group(1)
performance_date = None
for fmt in ('%Y.%m.%d', '%d.%m.%Y', '%d.%m.%y', '%Y-%m-%d'):
try:
date_obj = datetime.strptime(date_str, fmt)
# Convert to ISO format YYYY-MM-DD
performance_date = date_obj.strftime('%Y-%m-%d')
break
except ValueError:
continue
else:
performance_date = None
performance_date = extract_date_from_string(relative_path)
scanned_files.append((relative_path, foldername, entry.name, filetype, category, titel, name, performance_date, site, transcript, hit_count))
current_keys.add((relative_path, entry.name))