From 923693ed9a02181ed936e34ce220502672a3b1b6 Mon Sep 17 00:00:00 2001
From: lelo
Date: Sun, 11 May 2025 12:50:05 +0200
Subject: [PATCH] advanced date extraction

---
Reviewer notes (text between the three-dash line and the diff is not part
of the commit message):
 * extract_date_from_string(): a one-digit day/month combined with a
   two-digit year (e.g. "1.5.24") parses again, as it did with the code
   removed from updatefileindex(). The dashed-ISO branch was unreachable
   because the regex only matched dots, so ISO dates now get their own pass.
   Later candidates are tried when the first one is not a valid date.
 * Line breaks and leading whitespace were lost in the copy under review.
   Blank context lines and the base indentation of the second hunk
   (16 spaces assumed) are reconstructed - verify against
   index_for_search.py before applying.
 * The post-image blob id in the "index" line predates these edits.

 index_for_search.py | 58 +++++++++++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 15 deletions(-)

diff --git a/index_for_search.py b/index_for_search.py
index 3d6a010..1ad7ff5 100755
--- a/index_for_search.py
+++ b/index_for_search.py
@@ -69,6 +69,48 @@ def get_hit_count(relative_path):
     row = cursor.fetchone()
     return row["hit_count"] if row else 0
 
+
+def extract_date_from_string(string_with_date):
+    """Return the first parseable date in *string_with_date* as 'YYYY-MM-DD'.
+
+    Dotted dates are tried first, in order of appearance:
+      * DD.MM.YYYY           (last group has four digits)
+      * YYYY.MM.DD           (first group has four digits)
+      * DD.MM.YY / YY.MM.DD  (last group has two digits; day-first preferred)
+    Day and month may have one or two digits.  If no dotted candidate
+    parses, a dashed ISO date (YYYY-MM-DD) is accepted instead.
+
+    Returns None when the string contains no valid date.
+    """
+    for m in re.finditer(r'(\d{1,4})\.(\d{1,2})\.(\d{1,4})', string_with_date):
+        first, _, last = m.groups()
+        if len(last) == 4:
+            formats = ('%d.%m.%Y',)
+        elif len(first) == 4:
+            formats = ('%Y.%m.%d',)
+        elif len(last) == 2:
+            # ambiguous two-digit year: prefer DD.MM.YY, fall back to YY.MM.DD
+            formats = ('%d.%m.%y', '%y.%m.%d')
+        else:
+            # e.g. version numbers such as "1.2.3" -- try the next candidate
+            continue
+
+        for fmt in formats:
+            try:
+                return datetime.strptime(m.group(0), fmt).strftime('%Y-%m-%d')
+            except ValueError:
+                continue
+
+    # the dotted pattern can never match dashes, so ISO dates get their own pass
+    for m in re.finditer(r'\d{4}-\d{1,2}-\d{1,2}', string_with_date):
+        try:
+            return datetime.strptime(m.group(0), '%Y-%m-%d').strftime('%Y-%m-%d')
+        except ValueError:
+            continue
+
+    return None
+
+
 def updatefileindex():
     cursor = search_db.cursor()
 
@@ -169,21 +211,7 @@ def updatefileindex():
                 titel = None
                 name = None
 
-                # extract the date from path using regex (supports YYYY.MM.DD, DD.MM.YYYY or DD.MM.YY)
-                date_match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{2,4}|\d{4}\.\d{2}\.\d{2})', relative_path)
-                if date_match:
-                    date_str = date_match.group(1)
-                    performance_date = None
-                    for fmt in ('%Y.%m.%d', '%d.%m.%Y', '%d.%m.%y', '%Y-%m-%d'):
-                        try:
-                            date_obj = datetime.strptime(date_str, fmt)
-                            # Convert to ISO format YYYY-MM-DD
-                            performance_date = date_obj.strftime('%Y-%m-%d')
-                            break
-                        except ValueError:
-                            continue
-                else:
-                    performance_date = None
+                performance_date = extract_date_from_string(relative_path)
 
                 scanned_files.append((relative_path, foldername, entry.name, filetype, category, titel, name, performance_date, site, transcript, hit_count))
                 current_keys.add((relative_path, entry.name))