import os
import json
import sqlite3

SEARCH_DB_NAME = 'search.db'
search_db = sqlite3.connect(SEARCH_DB_NAME, check_same_thread=False)
search_db.row_factory = sqlite3.Row


def init_db():
    """Initializes the database with the required schema."""
    cursor = search_db.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS files (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            relative_path TEXT,
            filename TEXT,
            filetype TEXT,
            transcript TEXT,
            UNIQUE(relative_path, filename)
        )
    ''')
    search_db.commit()


def scan_dir(directory):
    """Recursively scan a directory using os.scandir for improved performance."""
    try:
        with os.scandir(directory) as it:
            for entry in it:
                if entry.is_dir(follow_symlinks=False):
                    # Skip transcription directories immediately.
                    if entry.name.lower() == "transkription":
                        continue
                    yield from scan_dir(entry.path)
                elif entry.is_file(follow_symlinks=False):
                    yield entry
    except PermissionError:
        # Silently skip directories we are not allowed to read.
        return


def updatefileindex():
    """Synchronizes the files table with the folders listed in folder_config.json.

    The config file is expected to be a JSON array of objects, each carrying a
    "folders" list whose entries hold a "foldername" (display name used as the
    stored path prefix) and a "folderpath" (location on disk).
    """
    cursor = search_db.cursor()
    # Load folder configuration from JSON file.
    with open("folder_config.json", "r", encoding="utf-8") as f:
        config_data = json.load(f)
    # Process each configured base folder.
    for config in config_data:
        for folder in config.get("folders", []):
            foldername = folder.get("foldername")
            raw_folderpath = folder.get("folderpath")
            if not foldername or not raw_folderpath:
                continue  # Skip incomplete config entries.
            norm_folderpath = os.path.normpath(raw_folderpath)
            # Precompute the length of the base folder path (plus one for the separator).
            base_len = len(norm_folderpath) + 1
            # Accumulate scanned file data and keys for this base folder.
            scanned_files = []  # Each entry: (relative_path, filename, filetype, transcript)
            current_keys = set()
            for entry in scan_dir(norm_folderpath):
                entry_path = os.path.normpath(entry.path)
                # Get the relative part by slicing if possible.
                if entry_path.startswith(norm_folderpath):
                    rel_part = entry_path[base_len:]
                else:
                    rel_part = os.path.relpath(entry_path, norm_folderpath)
                # Prepend the foldername so it becomes part of the stored relative path.
                relative_path = os.path.join(foldername, rel_part).replace(os.sep, '/')
                print(relative_path)
                filetype = os.path.splitext(entry.name)[1].lower()
                transcript = None
                # Check for a corresponding transcript file in a sibling "Transkription" folder.
                parent_dir = os.path.dirname(entry_path)
                transcript_dir = os.path.join(parent_dir, "Transkription")
                transcript_filename = os.path.splitext(entry.name)[0] + ".md"
                transcript_path = os.path.join(transcript_dir, transcript_filename)
                if os.path.isfile(transcript_path):
                    try:
                        with open(transcript_path, 'r', encoding='utf-8') as tf:
                            transcript = tf.read()
                    except (OSError, UnicodeDecodeError):
                        transcript = None
                scanned_files.append((relative_path, entry.name, filetype, transcript))
                current_keys.add((relative_path, entry.name))
            # Remove database entries for files under this base folder that are no
            # longer on disk. Stored paths always use '/' as the separator, so the
            # LIKE pattern must use '/' as well (os.sep would break on Windows).
            pattern = foldername + '/%'
            cursor.execute("SELECT relative_path, filename FROM files WHERE relative_path LIKE ?", (pattern,))
            keys_in_db = {(row["relative_path"], row["filename"]) for row in cursor.fetchall()}
            keys_to_delete = keys_in_db - current_keys
            cursor.executemany(
                "DELETE FROM files WHERE relative_path = ? AND filename = ?",
                keys_to_delete
            )
            # Bulk write the scanned files using INSERT OR REPLACE.
            cursor.executemany(
                "INSERT OR REPLACE INTO files (relative_path, filename, filetype, transcript) VALUES (?, ?, ?, ?)",
                scanned_files
            )
            # Commit changes after processing this base folder.
            search_db.commit()
    return "File index updated successfully"
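
# A minimal sketch of how the finished index might be queried; this helper is
# illustrative rather than part of the indexing workflow above, and the name
# search_transcripts is an assumption. It relies only on the files table
# created in init_db(); SQLite's LIKE is case-insensitive for ASCII by default.
def search_transcripts(term):
    """Return (relative_path, filename) rows whose transcript contains term."""
    cursor = search_db.cursor()
    cursor.execute(
        "SELECT relative_path, filename FROM files WHERE transcript LIKE ?",
        ('%' + term + '%',)
    )
    return cursor.fetchall()
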
if __name__ == "__main__":
    init_db()            # Initialize the database schema if it doesn't exist
    updatefileindex()    # Update the file index
    search_db.close()    # Close the database connection
    print("Database connection closed.")
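
# Illustrative follow-up (not part of the script): after a run, the index can
# be inspected with the sqlite3 command-line shell, e.g.:
#   sqlite3 search.db "SELECT relative_path, filetype FROM files LIMIT 5;"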