Improve indexing: iterative directory scan, cached transcript lookups, lighter progress logging

lelo 2026-01-24 16:07:44 +00:00
parent c7c52f0dc2
commit a5930cb506


@@ -2,14 +2,22 @@ import os
import json
import sqlite3
from datetime import datetime
from time import monotonic
import re
import helperfunctions as hf
from collections import defaultdict
SEARCH_DB_NAME = 'search.db'
ACCESS_LOG_DB_NAME = 'access_log.db'
FOLDER_CONFIG = 'folder_secret_config.json'
IGNORED_DIRS = {"Transkription", "@eaDir", ".app", "#recycle"}
TRANSCRIPT_DIRNAME = "Transkription"
TRANSCRIPT_EXT = ".md"
IGNORED_DIRS = {TRANSCRIPT_DIRNAME, "@eaDir", ".app", "#recycle"}
# Logging/progress tuning (keep output light by default)
LOG_STRUCTURE_DEPTH = int(os.getenv("INDEX_LOG_STRUCTURE_DEPTH", "0") or 0)
PROGRESS_EVERY_SECS = float(os.getenv("INDEX_PROGRESS_SECS", "30"))
PROGRESS_EVERY_FILES = int(os.getenv("INDEX_PROGRESS_FILES", "5000"))
MAX_ERROR_LOGS = int(os.getenv("INDEX_MAX_ERROR_LOGS", "5"))
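For reference, the `or 0` fallback on INDEX_LOG_STRUCTURE_DEPTH makes an exported-but-empty variable behave like an unset one; a minimal sketch reusing the `os` import above (illustrative, not part of this commit):

os.environ["INDEX_LOG_STRUCTURE_DEPTH"] = ""  # assumption: variable set but left empty
# os.getenv returns "" here because the variable exists, and int("") would raise
# ValueError; the `or 0` fallback substitutes the default instead.
depth = int(os.getenv("INDEX_LOG_STRUCTURE_DEPTH", "0") or 0)
print(depth)  # 0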
# Connect to the search database.
search_db = sqlite3.connect(SEARCH_DB_NAME, check_same_thread=False)
@@ -24,10 +32,25 @@ def log(message: str):
print(message, flush=True)
def log_permission_error(path: str, stats: dict):
"""Log permission errors sparingly to avoid noisy output."""
stats["perm_errors"] += 1
if stats["perm_errors"] <= MAX_ERROR_LOGS:
log(f"Permission denied: {path}")
elif stats["perm_errors"] == MAX_ERROR_LOGS + 1:
log("Further permission errors suppressed.")
def skip_dir(name: str) -> bool:
"""Return True when a directory name should be skipped during traversal/logging."""
return name.startswith('.') or name in IGNORED_DIRS
def format_duration(seconds: float) -> str:
total_secs = int(seconds)
minutes, secs = divmod(total_secs, 60)
return f"{minutes}m {secs:02d}s"
def init_db():
"""Initializes the database with the required schema."""
cursor = search_db.cursor()
@@ -59,21 +82,28 @@ def init_db():
search_db.commit()
def scan_dir(directory):
"""Recursively scan directories using os.scandir for improved performance."""
try:
with os.scandir(directory) as it:
for entry in it:
if entry.is_dir(follow_symlinks=False):
# Skip unwanted directories immediately.
if skip_dir(entry.name):
continue
yield from scan_dir(entry.path)
elif entry.is_file(follow_symlinks=False):
yield entry
except PermissionError:
log(f"Permission denied: {directory}")
return
def scan_dir(directory: str, stats: dict):
"""Iteratively scan directories using os.scandir for improved performance."""
stack = [directory]
while stack:
current = stack.pop()
stats["dirs"] += 1
try:
with os.scandir(current) as it:
for entry in it:
try:
if entry.is_dir(follow_symlinks=False):
# Skip unwanted directories immediately.
if skip_dir(entry.name):
stats["skipped_dirs"] += 1
continue
stack.append(entry.path)
elif entry.is_file(follow_symlinks=False):
yield entry
except PermissionError:
log_permission_error(entry.path, stats)
except PermissionError:
log_permission_error(current, stats)
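A minimal caller sketch for the iterative scanner, using a placeholder root path and the stats keys expected by scan_dir (illustrative, not part of this commit):

stats = {"dirs": 0, "skipped_dirs": 0, "perm_errors": 0}
for entry in scan_dir("/srv/media", stats):  # placeholder root path
    print(entry.path)  # yields os.DirEntry objects for regular files only
print(f"{stats['dirs']} dirs visited, {stats['skipped_dirs']} skipped, "
      f"{stats['perm_errors']} permission errors")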
def get_hit_count(relative_path):
"""Returns the hit count for a given file from the access log database."""
@@ -94,6 +124,26 @@ def get_hit_counts_for_basefolder(basefolder: str) -> dict:
return {row["rel_path"]: row["hit_count"] for row in cursor.fetchall()}
def build_transcript_index(transcript_dir: str, stats: dict):
"""Return a dict of basename -> transcript path for a transcript directory."""
try:
with os.scandir(transcript_dir) as it:
index = {}
for entry in it:
if not entry.is_file(follow_symlinks=False):
continue
name = entry.name
if not name.endswith(TRANSCRIPT_EXT):
continue
index[name[:-len(TRANSCRIPT_EXT)]] = entry.path
return index
except FileNotFoundError:
return None
except PermissionError:
log_permission_error(transcript_dir, stats)
return None
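build_transcript_index returns None both when no Transkription folder exists and on a permission error, so the caller further down caches that negative result per parent directory; a sketch of the pattern with a hypothetical lookup_transcript helper (illustrative, not part of this commit):

transcript_cache = {}

def lookup_transcript(parent_dir, name_root, scan_stats):
    index = transcript_cache.get(parent_dir)
    if index is None and parent_dir not in transcript_cache:
        # First visit to this directory: build the index (or fail) exactly once.
        index = build_transcript_index(
            os.path.join(parent_dir, TRANSCRIPT_DIRNAME), scan_stats)
        transcript_cache[parent_dir] = index  # deliberately caches None as well
    return index.get(name_root) if index else None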
def log_structure(root_path, max_depth=None, show_files=False):
"""
Log folder structure up to max_depth levels (root = depth 1).
@@ -130,14 +180,8 @@ def log_file(relative_path: str, filename: str):
log(f" file: {relative_path} ({filename})")
def log_directory_batch(directory: str, files: list[str]):
"""Log file count for a directory without listing filenames."""
if not files:
return
log(f" Dir {directory or '/'}: {len(files)} files")
def updatefileindex():
total_start = monotonic()
cursor = search_db.cursor()
totals = {"folders": 0, "scanned": 0, "deleted": 0}
@@ -153,71 +197,102 @@ def updatefileindex():
log(f"Processing folder: {foldername}")
raw_folderpath = folder.get("folderpath")
norm_folderpath = os.path.normpath(raw_folderpath)
# Only log folder names up to 3 levels deep; suppress filenames
log_structure(norm_folderpath, max_depth=1, show_files=False)
# Optional shallow structure log (off by default)
if LOG_STRUCTURE_DEPTH > 0:
log_structure(norm_folderpath, max_depth=LOG_STRUCTURE_DEPTH, show_files=False)
# Precompute the length of the base folder path (plus one for the separator)
base_len = len(norm_folderpath) + 1
base_prefix = norm_folderpath + os.sep
base_len = len(base_prefix)
# Prefetch hit counts for this basefolder to avoid per-file queries
hitcount_map = get_hit_counts_for_basefolder(foldername)
# Accumulate scanned file data and keys for this base folder.
scanned_files = []  # Each entry: (relative_path, basefolder, filename, filetype, category, titel, name, performance_date, site, transcript, hit_count)
current_keys = set()
dir_files = defaultdict(list) # map of directory -> list of filenames
for entry in scan_dir(norm_folderpath):
scan_stats = {"dirs": 0, "skipped_dirs": 0, "perm_errors": 0}
transcript_cache = {}
transcripts_read = 0
transcript_errors = 0
site = None
if foldername == 'Gottesdienste Speyer':
site = 'Speyer'
elif foldername == 'Gottesdienste Schwegenheim':
site = 'Schwegenheim'
start_time = monotonic()
last_log_time = start_time
next_log_count = PROGRESS_EVERY_FILES
scanned_count = 0
extract_structure = hf.extract_structure_from_string
extract_date = hf.extract_date_from_string
for entry in scan_dir(norm_folderpath, scan_stats):
transcript = None
entry_path = os.path.normpath(entry.path)
scanned_count += 1
entry_path = entry.path
# Get relative part by slicing if possible.
if entry_path.startswith(norm_folderpath):
if entry_path.startswith(base_prefix):
rel_part = entry_path[base_len:]
else:
rel_part = os.path.relpath(entry_path, norm_folderpath)
# Prepend the foldername so it becomes part of the stored relative path.
relative_path = os.path.join(foldername, rel_part).replace(os.sep, '/')
filetype = os.path.splitext(entry.name)[1].lower()
rel_part = rel_part.replace(os.sep, '/')
relative_path = f"{foldername}/{rel_part}"
name_root, name_ext = os.path.splitext(entry.name)
filetype = name_ext.lower()
# Retrieve the hit count for this file from the pre-fetched map.
hit_count = hitcount_map.get(relative_path, 0)
# Determine the site
if foldername == 'Gottesdienste Speyer':
site = 'Speyer'
elif foldername == 'Gottesdienste Schwegenheim':
site = 'Schwegenheim'
else:
site = None
# Check for a corresponding transcript file in a sibling "Transkription" folder.
parent_dir = os.path.dirname(entry_path)
transcript_dir = os.path.join(parent_dir, "Transkription")
transcript_filename = os.path.splitext(entry.name)[0] + ".md"
transcript_path = os.path.join(transcript_dir, transcript_filename)
if os.path.exists(transcript_path):
try:
with open(transcript_path, 'r', encoding='utf-8') as tf:
transcript = tf.read()
except Exception:
transcript = None
transcript_index = transcript_cache.get(parent_dir)
if transcript_index is None and parent_dir not in transcript_cache:
transcript_dir = os.path.join(parent_dir, TRANSCRIPT_DIRNAME)
transcript_index = build_transcript_index(transcript_dir, scan_stats)
transcript_cache[parent_dir] = transcript_index
if transcript_index:
transcript_path = transcript_index.get(name_root)
if transcript_path:
try:
with open(transcript_path, 'r', encoding='utf-8') as tf:
transcript = tf.read()
transcripts_read += 1
except Exception:
transcript_errors += 1
category, titel, name = hf.extract_structure_from_string(entry.name)
category, titel, name = extract_structure(entry.name)
performance_date = hf.extract_date_from_string(relative_path)
# Debug: batch file logging per directory
dir_files[os.path.dirname(relative_path)].append(entry.name)
performance_date = extract_date(relative_path)
scanned_files.append((relative_path, foldername, entry.name, filetype, category, titel, name, performance_date, site, transcript, hit_count))
current_keys.add((relative_path, entry.name))
# After scanning, log grouped files per directory
for d, files in dir_files.items():
log_directory_batch(d, files)
# Light progress output
now = monotonic()
if scanned_count >= next_log_count or (now - last_log_time) >= PROGRESS_EVERY_SECS:
elapsed = max(now - start_time, 0.0001)
rate = scanned_count / elapsed
log(f" progress: {scanned_count} files, {scan_stats['dirs']} dirs, {rate:.1f} files/s")
last_log_time = now
next_log_count = scanned_count + PROGRESS_EVERY_FILES
# Progress indicator
dir_count = len(dir_files)
file_count = len(scanned_files)
log(f"Found {dir_count} folders and {file_count} files in '{foldername}'.")
dir_count = scan_stats["dirs"]
file_count = scanned_count
elapsed = max(monotonic() - start_time, 0.0001)
rate = file_count / elapsed
log(f"Scan summary for '{foldername}': {dir_count} dirs, {file_count} files, {rate:.1f} files/s")
if scan_stats["skipped_dirs"]:
log(f" skipped dirs: {scan_stats['skipped_dirs']}")
if scan_stats["perm_errors"]:
log(f" permission errors: {scan_stats['perm_errors']}")
if transcripts_read or transcript_errors:
log(f" transcripts: {transcripts_read} read, {transcript_errors} errors")
log("updating database...")
scan_duration = format_duration(elapsed)
# Remove database entries for files under this base folder that are no longer on disk.
pattern = foldername + os.sep + '%'
@@ -238,11 +313,14 @@ def updatefileindex():
# Commit changes after processing this base folder.
search_db.commit()
folder_scanned = len(scanned_files)
folder_scanned = scanned_count
totals["scanned"] += folder_scanned
log(f"Indexed {folder_scanned} files (deleted {deleted_count}) in '{foldername}'")
log(f"Scan duration for '{foldername}': {scan_duration}")
total_elapsed = max(monotonic() - total_start, 0.0001)
log(f"Index update finished: folders={totals['folders']}, files indexed={totals['scanned']}, removed={totals['deleted']}")
log(f"Total index duration: {format_duration(total_elapsed)}")
return "File index updated successfully"
def convert_dates(search_db,