diff --git a/index_for_search.py b/index_for_search.py
index 81376e3..b26dd1e 100755
--- a/index_for_search.py
+++ b/index_for_search.py
@@ -2,14 +2,22 @@
 import os
 import json
 import sqlite3
 from datetime import datetime
+from time import monotonic
 import re
 import helperfunctions as hf
-from collections import defaultdict
 SEARCH_DB_NAME = 'search.db'
 ACCESS_LOG_DB_NAME = 'access_log.db'
 FOLDER_CONFIG = 'folder_secret_config.json'
-IGNORED_DIRS = {"Transkription", "@eaDir", ".app", "#recycle"}
+TRANSCRIPT_DIRNAME = "Transkription"
+TRANSCRIPT_EXT = ".md"
+IGNORED_DIRS = {TRANSCRIPT_DIRNAME, "@eaDir", ".app", "#recycle"}
+
+# Logging/progress tuning (keep output light by default)
+LOG_STRUCTURE_DEPTH = int(os.getenv("INDEX_LOG_STRUCTURE_DEPTH", "0") or 0)
+PROGRESS_EVERY_SECS = float(os.getenv("INDEX_PROGRESS_SECS", "30"))
+PROGRESS_EVERY_FILES = int(os.getenv("INDEX_PROGRESS_FILES", "5000"))
+MAX_ERROR_LOGS = int(os.getenv("INDEX_MAX_ERROR_LOGS", "5"))

 # Connect to the search database.
 search_db = sqlite3.connect(SEARCH_DB_NAME, check_same_thread=False)
@@ -24,10 +32,25 @@ def log(message: str):
     print(message, flush=True)


+def log_permission_error(path: str, stats: dict):
+    """Log permission errors sparingly to avoid noisy output."""
+    stats["perm_errors"] += 1
+    if stats["perm_errors"] <= MAX_ERROR_LOGS:
+        log(f"Permission denied: {path}")
+    elif stats["perm_errors"] == MAX_ERROR_LOGS + 1:
+        log("Further permission errors suppressed.")
+
+
 def skip_dir(name: str) -> bool:
     """Return True when a directory name should be skipped during traversal/logging."""
     return name.startswith('.') or name in IGNORED_DIRS

+
+def format_duration(seconds: float) -> str:
+    total_secs = int(seconds)
+    minutes, secs = divmod(total_secs, 60)
+    return f"{minutes}m {secs:02d}s"
+
 def init_db():
     """Initializes the database with the required schema."""
     cursor = search_db.cursor()
@@ -59,21 +82,28 @@ def init_db():
     search_db.commit()


-def scan_dir(directory):
-    """Recursively scan directories using os.scandir for improved performance."""
-    try:
-        with os.scandir(directory) as it:
-            for entry in it:
-                if entry.is_dir(follow_symlinks=False):
-                    # Skip unwanted directories immediately.
-                    if skip_dir(entry.name):
-                        continue
-                    yield from scan_dir(entry.path)
-                elif entry.is_file(follow_symlinks=False):
-                    yield entry
-    except PermissionError:
-        log(f"Permission denied: {directory}")
-        return
+def scan_dir(directory: str, stats: dict):
+    """Iteratively scan directories using os.scandir for improved performance."""
+    stack = [directory]
+    while stack:
+        current = stack.pop()
+        stats["dirs"] += 1
+        try:
+            with os.scandir(current) as it:
+                for entry in it:
+                    try:
+                        if entry.is_dir(follow_symlinks=False):
+                            # Skip unwanted directories immediately.
+                            if skip_dir(entry.name):
+                                stats["skipped_dirs"] += 1
+                                continue
+                            stack.append(entry.path)
+                        elif entry.is_file(follow_symlinks=False):
+                            yield entry
+                    except PermissionError:
+                        log_permission_error(entry.path, stats)
+        except PermissionError:
+            log_permission_error(current, stats)

 def get_hit_count(relative_path):
     """Returns the hit count for a given file from the access log database."""
@@ -94,6 +124,26 @@
     return {row["rel_path"]: row["hit_count"] for row in cursor.fetchall()}


+def build_transcript_index(transcript_dir: str, stats: dict):
+    """Return a dict of basename -> transcript path for a transcript directory."""
+    try:
+        with os.scandir(transcript_dir) as it:
+            index = {}
+            for entry in it:
+                if not entry.is_file(follow_symlinks=False):
+                    continue
+                name = entry.name
+                if not name.endswith(TRANSCRIPT_EXT):
+                    continue
+                index[name[:-len(TRANSCRIPT_EXT)]] = entry.path
+            return index
+    except FileNotFoundError:
+        return None
+    except PermissionError:
+        log_permission_error(transcript_dir, stats)
+        return None
+
+
 def log_structure(root_path, max_depth=None, show_files=False):
     """
     Log folder structure up to max_depth levels (root = depth 1).
@@ -130,14 +180,8 @@ def log_file(relative_path: str, filename: str):
     log(f" file: {relative_path} ({filename})")


-def log_directory_batch(directory: str, files: list[str]):
-    """Log file count for a directory without listing filenames."""
-    if not files:
-        return
-    log(f" Dir {directory or '/'}: {len(files)} files")
-
-
 def updatefileindex():
+    total_start = monotonic()
     cursor = search_db.cursor()
     totals = {"folders": 0, "scanned": 0, "deleted": 0}

@@ -153,71 +197,102 @@ def updatefileindex():
         log(f"Processing folder: {foldername}")
         raw_folderpath = folder.get("folderpath")
         norm_folderpath = os.path.normpath(raw_folderpath)
-        # Only log folder names up to 3 levels deep; suppress filenames
-        log_structure(norm_folderpath, max_depth=1, show_files=False)
+        # Optional shallow structure log (off by default)
+        if LOG_STRUCTURE_DEPTH > 0:
+            log_structure(norm_folderpath, max_depth=LOG_STRUCTURE_DEPTH, show_files=False)
         # Precompute the length of the base folder path (plus one for the separator)
-        base_len = len(norm_folderpath) + 1
+        base_prefix = norm_folderpath + os.sep
+        base_len = len(base_prefix)

         # Prefetch hit counts for this basefolder to avoid per-file queries
         hitcount_map = get_hit_counts_for_basefolder(foldername)

         # Accumulate scanned file data and keys for this base folder.
         scanned_files = []  # Each entry: (relative_path, basefolder, filename, filetype, transcript, hitcount)
         current_keys = set()
-        dir_files = defaultdict(list)  # map of directory -> list of filenames
-        for entry in scan_dir(norm_folderpath):
+        scan_stats = {"dirs": 0, "skipped_dirs": 0, "perm_errors": 0}
+        transcript_cache = {}
+        transcripts_read = 0
+        transcript_errors = 0
+
+        site = None
+        if foldername == 'Gottesdienste Speyer':
+            site = 'Speyer'
+        elif foldername == 'Gottesdienste Schwegenheim':
+            site = 'Schwegenheim'
+
+        start_time = monotonic()
+        last_log_time = start_time
+        next_log_count = PROGRESS_EVERY_FILES
+        scanned_count = 0
+
+        extract_structure = hf.extract_structure_from_string
+        extract_date = hf.extract_date_from_string
+
+        for entry in scan_dir(norm_folderpath, scan_stats):
             transcript = None
-            entry_path = os.path.normpath(entry.path)
+            scanned_count += 1
+            entry_path = entry.path
             # Get relative part by slicing if possible.
-            if entry_path.startswith(norm_folderpath):
+            if entry_path.startswith(base_prefix):
                 rel_part = entry_path[base_len:]
             else:
                 rel_part = os.path.relpath(entry_path, norm_folderpath)
             # Prepend the foldername so it becomes part of the stored relative path.
-            relative_path = os.path.join(foldername, rel_part).replace(os.sep, '/')
-            filetype = os.path.splitext(entry.name)[1].lower()
+            rel_part = rel_part.replace(os.sep, '/')
+            relative_path = f"{foldername}/{rel_part}"
+            name_root, name_ext = os.path.splitext(entry.name)
+            filetype = name_ext.lower()
             # Retrieve the hit count for this file from pre-fetched map.
             hit_count = hitcount_map.get(relative_path, 0)

-            # Determine the site
-            if foldername == 'Gottesdienste Speyer':
-                site = 'Speyer'
-            elif foldername == 'Gottesdienste Schwegenheim':
-                site = 'Schwegenheim'
-            else:
-                site = None
-
             # Check for a corresponding transcript file in a sibling "Transkription" folder.
             parent_dir = os.path.dirname(entry_path)
-            transcript_dir = os.path.join(parent_dir, "Transkription")
-            transcript_filename = os.path.splitext(entry.name)[0] + ".md"
-            transcript_path = os.path.join(transcript_dir, transcript_filename)
-            if os.path.exists(transcript_path):
-                try:
-                    with open(transcript_path, 'r', encoding='utf-8') as tf:
-                        transcript = tf.read()
-                except Exception:
-                    transcript = None
+            transcript_index = transcript_cache.get(parent_dir)
+            if transcript_index is None and parent_dir not in transcript_cache:
+                transcript_dir = os.path.join(parent_dir, TRANSCRIPT_DIRNAME)
+                transcript_index = build_transcript_index(transcript_dir, scan_stats)
+                transcript_cache[parent_dir] = transcript_index
+            if transcript_index:
+                transcript_path = transcript_index.get(name_root)
+                if transcript_path:
+                    try:
+                        with open(transcript_path, 'r', encoding='utf-8') as tf:
+                            transcript = tf.read()
+                            transcripts_read += 1
+                    except Exception:
+                        transcript_errors += 1
-            category, titel, name = hf.extract_structure_from_string(entry.name)
+            category, titel, name = extract_structure(entry.name)

-            performance_date = hf.extract_date_from_string(relative_path)
-
-            # Debug: batch file logging per directory
-            dir_files[os.path.dirname(relative_path)].append(entry.name)
+            performance_date = extract_date(relative_path)

             scanned_files.append((relative_path, foldername, entry.name, filetype,
                                   category, titel, name, performance_date, site, transcript, hit_count))
             current_keys.add((relative_path, entry.name))
-        # After scanning, log grouped files per directory
-        for d, files in dir_files.items():
-            log_directory_batch(d, files)
+            # Light progress output
+            now = monotonic()
+            if scanned_count >= next_log_count or (now - last_log_time) >= PROGRESS_EVERY_SECS:
+                elapsed = max(now - start_time, 0.0001)
+                rate = scanned_count / elapsed
+                log(f" progress: {scanned_count} files, {scan_stats['dirs']} dirs, {rate:.1f} files/s")
+                last_log_time = now
+                next_log_count = scanned_count + PROGRESS_EVERY_FILES

         # Progress indicator
-        dir_count = len(dir_files)
-        file_count = len(scanned_files)
-        log(f"Found {dir_count} folders and {file_count} files in '{foldername}'.")
+        dir_count = scan_stats["dirs"]
+        file_count = scanned_count
+        elapsed = max(monotonic() - start_time, 0.0001)
+        rate = file_count / elapsed
+        log(f"Scan summary for '{foldername}': {dir_count} dirs, {file_count} files, {rate:.1f} files/s")
+        if scan_stats["skipped_dirs"]:
+            log(f" skipped dirs: {scan_stats['skipped_dirs']}")
+        if scan_stats["perm_errors"]:
+            log(f" permission errors: {scan_stats['perm_errors']}")
+        if transcripts_read or transcript_errors:
+            log(f" transcripts: {transcripts_read} read, {transcript_errors} errors")

         log("updating database...")
+        scan_duration = format_duration(elapsed)

         # Remove database entries for files under this base folder that are no longer on disk.
         pattern = foldername + os.sep + '%'
@@ -238,11 +313,14 @@
         # Commit changes after processing this base folder.
         search_db.commit()

-        folder_scanned = len(scanned_files)
+        folder_scanned = scanned_count
         totals["scanned"] += folder_scanned
         log(f"Indexed {folder_scanned} files (deleted {deleted_count}) in '{foldername}'")
+        log(f"Scan duration for '{foldername}': {scan_duration}")

+    total_elapsed = max(monotonic() - total_start, 0.0001)
     log(f"Index update finished: folders={totals['folders']}, files indexed={totals['scanned']}, removed={totals['deleted']}")
+    log(f"Total index duration: {format_duration(total_elapsed)}")
     return "File index updated successfully"

 def convert_dates(search_db,