From eec9deefadd3830aa2865616f7b4eea374ee521a Mon Sep 17 00:00:00 2001 From: lelo Date: Sat, 12 Apr 2025 19:49:23 +0000 Subject: [PATCH] improve indexing --- index_for_search.py | 76 +++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 33 deletions(-) diff --git a/index_for_search.py b/index_for_search.py index ef9deec..6a157ed 100755 --- a/index_for_search.py +++ b/index_for_search.py @@ -128,36 +128,46 @@ def updatefileindex(): # Prepend the foldername so it becomes part of the stored relative path. relative_path = os.path.join(foldername, rel_part).replace(os.sep, '/') filetype = os.path.splitext(entry.name)[1].lower() - transcript = None - - # Check for a corresponding transcript file in a sibling "Transkription" folder. - parent_dir = os.path.dirname(entry_path) - transcript_dir = os.path.join(parent_dir, "Transkription") - transcript_filename = os.path.splitext(entry.name)[0] + ".md" - transcript_path = os.path.join(transcript_dir, transcript_filename) - if os.path.exists(transcript_path): - try: - with open(transcript_path, 'r', encoding='utf-8') as tf: - transcript = tf.read() - except Exception: - transcript = None - + # Retrieve the hit count for this file. hit_count = get_hit_count(relative_path) category, titel, name, performance_date, site = None, None, None, None, None + # Determine the site + if foldername == 'Gottesdienste Speyer': + site = 'Speyer' + elif foldername == 'Gottesdienste Schwegenheim': + site = 'Schwegenheim' + if filetype == '.mp3': - # Determine the site - if foldername == 'Gottesdienste Speyer': - site = 'Speyer' - elif foldername == 'Gottesdienste Schwegenheim': - site = 'Schwegenheim' + transcript = None + + # Check for a corresponding transcript file in a sibling "Transkription" folder. + parent_dir = os.path.dirname(entry_path) + transcript_dir = os.path.join(parent_dir, "Transkription") + transcript_filename = os.path.splitext(entry.name)[0] + ".md" + transcript_path = os.path.join(transcript_dir, transcript_filename) + if os.path.exists(transcript_path): + try: + with open(transcript_path, 'r', encoding='utf-8') as tf: + transcript = tf.read() + except Exception: + transcript = None # extract category and titel from filename filename_ext = os.path.splitext(entry.name)[0] left_side, right_side = filename_ext.split('-', 1) if '-' in filename_ext else (filename_ext, None) - if 'predigt' in left_side.lower(): + try: + int(left_side.strip()) + # first part is only a number + previous_right_side = right_side + left_side, right_side = previous_right_side.split('-', 1) if '-' in previous_right_side else (previous_right_side, None) + except: + # first part not a number + continue + + if 'predig' in left_side.lower(): category = 'Predigt' elif 'wort' in left_side.lower() or 'einladung' in left_side.lower(): category = 'Vorwort' @@ -186,22 +196,22 @@ def updatefileindex(): titel = None name = None - # extract the date from path using regex (dd.mm.yyyy or dd.mm.yy) - date_match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{2,4})', relative_path) - if date_match: - date_str = date_match.group(1) - # Convert to YYYY-MM-DD format + # extract the date from path using regex (dd.mm.yyyy or dd.mm.yy) + date_match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{2,4})', relative_path) + if date_match: + date_str = date_match.group(1) + # Convert to YYYY-MM-DD format + try: + date_obj = datetime.strptime(date_str, '%d.%m.%Y') + performance_date = date_obj.strftime('%d.%m.%Y') + except ValueError: try: - date_obj = datetime.strptime(date_str, '%d.%m.%Y') + date_obj = datetime.strptime(date_str, '%d.%m.%y') performance_date = date_obj.strftime('%d.%m.%Y') except ValueError: - try: - date_obj = datetime.strptime(date_str, '%d.%m.%y') - performance_date = date_obj.strftime('%d.%m.%Y') - except ValueError: - performance_date = None - else: - performance_date = None + performance_date = None + else: + performance_date = None scanned_files.append((relative_path, foldername, entry.name, filetype, category, titel, name, performance_date, site, transcript, hit_count)) current_keys.add((relative_path, entry.name))