advanced date extraction
This commit is contained in:
parent
688625f451
commit
923693ed9a
@ -69,6 +69,52 @@ def get_hit_count(relative_path):
|
|||||||
row = cursor.fetchone()
|
row = cursor.fetchone()
|
||||||
return row["hit_count"] if row else 0
|
return row["hit_count"] if row else 0
|
||||||
|
|
||||||
|
|
||||||
|
# Dotted candidate: three digit groups separated by dots, e.g. "24.12.2023".
_DOTTED_DATE_RE = re.compile(r'(\d{1,4}\.\d{1,2}\.\d{1,4})')
# ISO candidate with dashes, e.g. "2023-12-24".
_ISO_DATE_RE = re.compile(r'(\d{4}-\d{1,2}-\d{1,2})')


def _dotted_date_formats(date_str):
    """Return the strptime formats to try, in order, for a dotted date string."""
    first, _, last = date_str.split('.')
    # 1) Unambiguous: last group is a four-digit year.
    if len(last) == 4:
        return ('%d.%m.%Y',)
    # 2) Unambiguous: first group is a four-digit year.
    if len(first) == 4:
        return ('%Y.%m.%d',)
    # 3) Two-digit year: prefer DD.MM.YY, fall back to YY.MM.DD.
    #    Day and month may be one or two digits; the fallback fails on its
    #    own when the first group is a single digit, because %y needs two.
    if len(last) == 2:
        return ('%d.%m.%y', '%y.%m.%d')
    return ()


def _parse_first_valid(date_str, formats):
    """Return date_str as 'YYYY-MM-DD' using the first format that parses, else None."""
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt).strftime('%Y-%m-%d')
        except ValueError:
            continue
    return None


def extract_date_from_string(string_with_date):
    """Extract the first parsable date from a string and return it in ISO form.

    Supported notations:
      * DD.MM.YYYY and YYYY.MM.DD (four-digit year decides the order)
      * DD.MM.YY, with YY.MM.DD as fallback when the first reading is invalid
        (day and month may be written with one or two digits)
      * YYYY-MM-DD (only consulted when no dotted date could be parsed)

    Args:
        string_with_date: Text (for example a relative file path) that may
            contain a date. Empty or None input yields None.

    Returns:
        The date formatted as 'YYYY-MM-DD', or None when no valid date is found.
    """
    if not string_with_date:
        return None

    # Dotted dates take priority. Walk every candidate so that an earlier
    # non-date token (e.g. a version number like "1.2.3") does not hide a
    # real date appearing later in the string.
    for match in _DOTTED_DATE_RE.finditer(string_with_date):
        candidate = match.group(1)
        iso_date = _parse_first_valid(candidate, _dotted_date_formats(candidate))
        if iso_date is not None:
            return iso_date

    # No dotted date found: look for ISO dates written with dashes.
    for match in _ISO_DATE_RE.finditer(string_with_date):
        iso_date = _parse_first_valid(match.group(1), ('%Y-%m-%d',))
        if iso_date is not None:
            return iso_date

    return None
|
||||||
|
|
||||||
|
|
||||||
def updatefileindex():
|
def updatefileindex():
|
||||||
cursor = search_db.cursor()
|
cursor = search_db.cursor()
|
||||||
|
|
||||||
@ -169,21 +215,7 @@ def updatefileindex():
|
|||||||
titel = None
|
titel = None
|
||||||
name = None
|
name = None
|
||||||
|
|
||||||
# extract the date from path using regex (supports YYYY.MM.DD, DD.MM.YYYY or DD.MM.YY)
|
performance_date = extract_date_from_string(relative_path)
|
||||||
date_match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{2,4}|\d{4}\.\d{2}\.\d{2})', relative_path)
|
|
||||||
if date_match:
|
|
||||||
date_str = date_match.group(1)
|
|
||||||
performance_date = None
|
|
||||||
for fmt in ('%Y.%m.%d', '%d.%m.%Y', '%d.%m.%y', '%Y-%m-%d'):
|
|
||||||
try:
|
|
||||||
date_obj = datetime.strptime(date_str, fmt)
|
|
||||||
# Convert to ISO format YYYY-MM-DD
|
|
||||||
performance_date = date_obj.strftime('%Y-%m-%d')
|
|
||||||
break
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
performance_date = None
|
|
||||||
|
|
||||||
scanned_files.append((relative_path, foldername, entry.name, filetype, category, titel, name, performance_date, site, transcript, hit_count))
|
scanned_files.append((relative_path, foldername, entry.name, filetype, category, titel, name, performance_date, site, transcript, hit_count))
|
||||||
current_keys.add((relative_path, entry.name))
|
current_keys.add((relative_path, entry.name))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user