advanced date extraction
This commit is contained in:
parent
688625f451
commit
923693ed9a
@ -69,6 +69,52 @@ def get_hit_count(relative_path):
|
||||
row = cursor.fetchone()
|
||||
return row["hit_count"] if row else 0
|
||||
|
||||
|
||||
def _parse_with_formats(date_str, formats):
    """Return ``date_str`` as ``YYYY-MM-DD`` using the first format that parses, else None."""
    for fmt in formats:
        try:
            # date.isoformat() always zero-pads the year to four digits.
            return datetime.strptime(date_str, fmt).date().isoformat()
        except ValueError:
            continue
    return None


def extract_date_from_string(string_with_date):
    """Find the first parseable date in a string and return it as ``YYYY-MM-DD``.

    Recognised notations:

    * ``DD.MM.YYYY`` - last group has four digits.
    * ``YYYY.MM.DD`` - first group has four digits.
    * ``DD.MM.YY``   - two-digit year; if that reading is not a valid date,
      ``YY.MM.DD`` is tried as a fallback. Day and month may be one digit.
    * ``YYYY-MM-DD`` - ISO notation with dashes, tried only when no dotted
      date could be parsed.

    Args:
        string_with_date: Text (for example a relative file path) that may
            contain a date.

    Returns:
        The date as an ISO ``YYYY-MM-DD`` string, or ``None`` when the input
        is empty or contains no valid date.
    """
    if not string_with_date:
        return None

    # Dotted candidates: X.Y.Z where X and Z are 1-4 digits and Y is 1-2 digits.
    # Every candidate is tried so that an earlier non-date token such as a
    # version number ("v1.2.3") does not hide a real date later in the string.
    for match in re.finditer(r'\d{1,4}\.\d{1,2}\.\d{1,4}', string_with_date):
        date_str = match.group(0)
        first, _middle, last = date_str.split('.')

        if len(last) == 4:
            # 1) Unambiguous: last group is the four-digit year.
            formats = ('%d.%m.%Y',)
        elif len(first) == 4:
            # 2) Unambiguous: first group is the four-digit year.
            formats = ('%Y.%m.%d',)
        elif len(first) <= 2 and len(last) <= 2:
            # 3) Ambiguous short form: prefer DD.MM.YY, fall back to YY.MM.DD.
            #    strptime's %y needs exactly two digits, so tokens like "1.2.3"
            #    are rejected by both formats.
            formats = ('%d.%m.%y', '%y.%m.%d')
        else:
            # Three-digit groups cannot be a day or a year we understand.
            continue

        iso_date = _parse_with_formats(date_str, formats)
        if iso_date:
            return iso_date

    # ISO notation with dashes. The digit look-arounds keep us from slicing a
    # date out of the middle of a longer number.
    for match in re.finditer(r'(?<!\d)\d{4}-\d{1,2}-\d{1,2}(?!\d)', string_with_date):
        iso_date = _parse_with_formats(match.group(0), ('%Y-%m-%d',))
        if iso_date:
            return iso_date

    return None
|
||||
|
||||
|
||||
def updatefileindex():
|
||||
cursor = search_db.cursor()
|
||||
|
||||
@ -169,21 +215,7 @@ def updatefileindex():
|
||||
titel = None
|
||||
name = None
|
||||
|
||||
# extract the date from path using regex (supports YYYY.MM.DD, DD.MM.YYYY or DD.MM.YY)
|
||||
date_match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{2,4}|\d{4}\.\d{2}\.\d{2})', relative_path)
|
||||
if date_match:
|
||||
date_str = date_match.group(1)
|
||||
performance_date = None
|
||||
for fmt in ('%Y.%m.%d', '%d.%m.%Y', '%d.%m.%y', '%Y-%m-%d'):
|
||||
try:
|
||||
date_obj = datetime.strptime(date_str, fmt)
|
||||
# Convert to ISO format YYYY-MM-DD
|
||||
performance_date = date_obj.strftime('%Y-%m-%d')
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
else:
|
||||
performance_date = None
|
||||
performance_date = extract_date_from_string(relative_path)
|
||||
|
||||
scanned_files.append((relative_path, foldername, entry.name, filetype, category, titel, name, performance_date, site, transcript, hit_count))
|
||||
current_keys.add((relative_path, entry.name))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user