improve indexing

lelo 2026-01-24 16:07:44 +00:00
parent c7c52f0dc2
commit a5930cb506


@@ -2,14 +2,22 @@ import os
 import json
 import sqlite3
 from datetime import datetime
+from time import monotonic
 import re
 import helperfunctions as hf
-from collections import defaultdict

 SEARCH_DB_NAME = 'search.db'
 ACCESS_LOG_DB_NAME = 'access_log.db'
 FOLDER_CONFIG = 'folder_secret_config.json'
-IGNORED_DIRS = {"Transkription", "@eaDir", ".app", "#recycle"}
+TRANSCRIPT_DIRNAME = "Transkription"
+TRANSCRIPT_EXT = ".md"
+IGNORED_DIRS = {TRANSCRIPT_DIRNAME, "@eaDir", ".app", "#recycle"}
+
+# Logging/progress tuning (keep output light by default)
+LOG_STRUCTURE_DEPTH = int(os.getenv("INDEX_LOG_STRUCTURE_DEPTH", "0") or 0)
+PROGRESS_EVERY_SECS = float(os.getenv("INDEX_PROGRESS_SECS", "30"))
+PROGRESS_EVERY_FILES = int(os.getenv("INDEX_PROGRESS_FILES", "5000"))
+MAX_ERROR_LOGS = int(os.getenv("INDEX_MAX_ERROR_LOGS", "5"))

 # Connect to the search database.
 search_db = sqlite3.connect(SEARCH_DB_NAME, check_same_thread=False)
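
Note on the new tuning constants above: each one is read from an environment variable, and the parsing expression falls back to a built-in default when the variable is unset (or, for the depth setting, empty). A minimal sketch of that behaviour, using only the variable names introduced in this commit; the concrete values are illustrative:

    import os

    # An unset or empty INDEX_LOG_STRUCTURE_DEPTH keeps structure logging off.
    os.environ["INDEX_LOG_STRUCTURE_DEPTH"] = ""
    depth = int(os.getenv("INDEX_LOG_STRUCTURE_DEPTH", "0") or 0)   # -> 0

    # Setting INDEX_PROGRESS_SECS shortens the progress interval from the 30s default.
    os.environ["INDEX_PROGRESS_SECS"] = "10"
    progress_secs = float(os.getenv("INDEX_PROGRESS_SECS", "30"))   # -> 10.0
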
@@ -24,10 +32,25 @@ def log(message: str):
     print(message, flush=True)

+def log_permission_error(path: str, stats: dict):
+    """Log permission errors sparingly to avoid noisy output."""
+    stats["perm_errors"] += 1
+    if stats["perm_errors"] <= MAX_ERROR_LOGS:
+        log(f"Permission denied: {path}")
+    elif stats["perm_errors"] == MAX_ERROR_LOGS + 1:
+        log("Further permission errors suppressed.")
+
 def skip_dir(name: str) -> bool:
     """Return True when a directory name should be skipped during traversal/logging."""
     return name.startswith('.') or name in IGNORED_DIRS

+def format_duration(seconds: float) -> str:
+    total_secs = int(seconds)
+    minutes, secs = divmod(total_secs, 60)
+    return f"{minutes}m {secs:02d}s"
+
 def init_db():
     """Initializes the database with the required schema."""
     cursor = search_db.cursor()
@@ -59,21 +82,28 @@ def init_db():
     search_db.commit()

-def scan_dir(directory):
-    """Recursively scan directories using os.scandir for improved performance."""
-    try:
-        with os.scandir(directory) as it:
-            for entry in it:
-                if entry.is_dir(follow_symlinks=False):
-                    # Skip unwanted directories immediately.
-                    if skip_dir(entry.name):
-                        continue
-                    yield from scan_dir(entry.path)
-                elif entry.is_file(follow_symlinks=False):
-                    yield entry
-    except PermissionError:
-        log(f"Permission denied: {directory}")
-        return
+def scan_dir(directory: str, stats: dict):
+    """Iteratively scan directories using os.scandir for improved performance."""
+    stack = [directory]
+    while stack:
+        current = stack.pop()
+        stats["dirs"] += 1
+        try:
+            with os.scandir(current) as it:
+                for entry in it:
+                    try:
+                        if entry.is_dir(follow_symlinks=False):
+                            # Skip unwanted directories immediately.
+                            if skip_dir(entry.name):
+                                stats["skipped_dirs"] += 1
+                                continue
+                            stack.append(entry.path)
+                        elif entry.is_file(follow_symlinks=False):
+                            yield entry
+                    except PermissionError:
+                        log_permission_error(entry.path, stats)
+        except PermissionError:
+            log_permission_error(current, stats)

 def get_hit_count(relative_path):
     """Returns the hit count for a given file from the access log database."""
@@ -94,6 +124,26 @@ def get_hit_counts_for_basefolder(basefolder: str) -> dict:
     return {row["rel_path"]: row["hit_count"] for row in cursor.fetchall()}

+def build_transcript_index(transcript_dir: str, stats: dict):
+    """Return a dict of basename -> transcript path for a transcript directory."""
+    try:
+        with os.scandir(transcript_dir) as it:
+            index = {}
+            for entry in it:
+                if not entry.is_file(follow_symlinks=False):
+                    continue
+                name = entry.name
+                if not name.endswith(TRANSCRIPT_EXT):
+                    continue
+                index[name[:-len(TRANSCRIPT_EXT)]] = entry.path
+            return index
+    except FileNotFoundError:
+        return None
+    except PermissionError:
+        log_permission_error(transcript_dir, stats)
+        return None
+
 def log_structure(root_path, max_depth=None, show_files=False):
     """
     Log folder structure up to max_depth levels (root = depth 1).
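
build_transcript_index returns None both when the Transkription folder is missing and when it cannot be read, and updatefileindex (below) caches that None so each parent directory is only probed once. The cache lookup therefore has to distinguish "not cached yet" from "cached as None", which is why the condition in the scan loop also tests membership. A compressed sketch of that pattern; cached_index is a hypothetical helper, not part of this commit:

    transcript_cache = {}

    def cached_index(parent_dir, stats):
        # None is a legitimate cached value (no transcript dir), so test membership, not just get().
        if parent_dir not in transcript_cache:
            transcript_dir = os.path.join(parent_dir, TRANSCRIPT_DIRNAME)
            transcript_cache[parent_dir] = build_transcript_index(transcript_dir, stats)
        return transcript_cache[parent_dir]
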
@@ -130,14 +180,8 @@ def log_file(relative_path: str, filename: str):
     log(f" file: {relative_path} ({filename})")

-def log_directory_batch(directory: str, files: list[str]):
-    """Log file count for a directory without listing filenames."""
-    if not files:
-        return
-    log(f" Dir {directory or '/'}: {len(files)} files")
-
 def updatefileindex():
+    total_start = monotonic()
     cursor = search_db.cursor()
     totals = {"folders": 0, "scanned": 0, "deleted": 0}
@@ -153,71 +197,102 @@ def updatefileindex():
         log(f"Processing folder: {foldername}")
         raw_folderpath = folder.get("folderpath")
         norm_folderpath = os.path.normpath(raw_folderpath)
-        # Only log folder names up to 3 levels deep; suppress filenames
-        log_structure(norm_folderpath, max_depth=1, show_files=False)
+        # Optional shallow structure log (off by default)
+        if LOG_STRUCTURE_DEPTH > 0:
+            log_structure(norm_folderpath, max_depth=LOG_STRUCTURE_DEPTH, show_files=False)

         # Precompute the length of the base folder path (plus one for the separator)
-        base_len = len(norm_folderpath) + 1
+        base_prefix = norm_folderpath + os.sep
+        base_len = len(base_prefix)

         # Prefetch hit counts for this basefolder to avoid per-file queries
         hitcount_map = get_hit_counts_for_basefolder(foldername)

         # Accumulate scanned file data and keys for this base folder.
         scanned_files = []  # Each entry: (relative_path, basefolder, filename, filetype, transcript, hitcount)
         current_keys = set()
-        dir_files = defaultdict(list)  # map of directory -> list of filenames
-        for entry in scan_dir(norm_folderpath):
+        scan_stats = {"dirs": 0, "skipped_dirs": 0, "perm_errors": 0}
+        transcript_cache = {}
+        transcripts_read = 0
+        transcript_errors = 0
+        site = None
+        if foldername == 'Gottesdienste Speyer':
+            site = 'Speyer'
+        elif foldername == 'Gottesdienste Schwegenheim':
+            site = 'Schwegenheim'
+        start_time = monotonic()
+        last_log_time = start_time
+        next_log_count = PROGRESS_EVERY_FILES
+        scanned_count = 0
+        extract_structure = hf.extract_structure_from_string
+        extract_date = hf.extract_date_from_string
+        for entry in scan_dir(norm_folderpath, scan_stats):
             transcript = None
-            entry_path = os.path.normpath(entry.path)
+            scanned_count += 1
+            entry_path = entry.path

             # Get relative part by slicing if possible.
-            if entry_path.startswith(norm_folderpath):
+            if entry_path.startswith(base_prefix):
                 rel_part = entry_path[base_len:]
             else:
                 rel_part = os.path.relpath(entry_path, norm_folderpath)

             # Prepend the foldername so it becomes part of the stored relative path.
-            relative_path = os.path.join(foldername, rel_part).replace(os.sep, '/')
-            filetype = os.path.splitext(entry.name)[1].lower()
+            rel_part = rel_part.replace(os.sep, '/')
+            relative_path = f"{foldername}/{rel_part}"
+            name_root, name_ext = os.path.splitext(entry.name)
+            filetype = name_ext.lower()

             # Retrieve the hit count for this file from pre-fetched map.
             hit_count = hitcount_map.get(relative_path, 0)

-            # Determine the site
-            if foldername == 'Gottesdienste Speyer':
-                site = 'Speyer'
-            elif foldername == 'Gottesdienste Schwegenheim':
-                site = 'Schwegenheim'
-            else:
-                site = None
-
             # Check for a corresponding transcript file in a sibling "Transkription" folder.
             parent_dir = os.path.dirname(entry_path)
-            transcript_dir = os.path.join(parent_dir, "Transkription")
-            transcript_filename = os.path.splitext(entry.name)[0] + ".md"
-            transcript_path = os.path.join(transcript_dir, transcript_filename)
-            if os.path.exists(transcript_path):
-                try:
-                    with open(transcript_path, 'r', encoding='utf-8') as tf:
-                        transcript = tf.read()
-                except Exception:
-                    transcript = None
+            transcript_index = transcript_cache.get(parent_dir)
+            if transcript_index is None and parent_dir not in transcript_cache:
+                transcript_dir = os.path.join(parent_dir, TRANSCRIPT_DIRNAME)
+                transcript_index = build_transcript_index(transcript_dir, scan_stats)
+                transcript_cache[parent_dir] = transcript_index
+            if transcript_index:
+                transcript_path = transcript_index.get(name_root)
+                if transcript_path:
+                    try:
+                        with open(transcript_path, 'r', encoding='utf-8') as tf:
+                            transcript = tf.read()
+                        transcripts_read += 1
+                    except Exception:
+                        transcript_errors += 1

-            category, titel, name = hf.extract_structure_from_string(entry.name)
-            performance_date = hf.extract_date_from_string(relative_path)
-            # Debug: batch file logging per directory
-            dir_files[os.path.dirname(relative_path)].append(entry.name)
+            category, titel, name = extract_structure(entry.name)
+            performance_date = extract_date(relative_path)

             scanned_files.append((relative_path, foldername, entry.name, filetype, category, titel, name, performance_date, site, transcript, hit_count))
             current_keys.add((relative_path, entry.name))

-        # After scanning, log grouped files per directory
-        for d, files in dir_files.items():
-            log_directory_batch(d, files)
+            # Light progress output
+            now = monotonic()
+            if scanned_count >= next_log_count or (now - last_log_time) >= PROGRESS_EVERY_SECS:
+                elapsed = max(now - start_time, 0.0001)
+                rate = scanned_count / elapsed
+                log(f" progress: {scanned_count} files, {scan_stats['dirs']} dirs, {rate:.1f} files/s")
+                last_log_time = now
+                next_log_count = scanned_count + PROGRESS_EVERY_FILES

         # Progress indicator
-        dir_count = len(dir_files)
-        file_count = len(scanned_files)
-        log(f"Found {dir_count} folders and {file_count} files in '{foldername}'.")
+        dir_count = scan_stats["dirs"]
+        file_count = scanned_count
+        elapsed = max(monotonic() - start_time, 0.0001)
+        rate = file_count / elapsed
+        log(f"Scan summary for '{foldername}': {dir_count} dirs, {file_count} files, {rate:.1f} files/s")
+        if scan_stats["skipped_dirs"]:
+            log(f" skipped dirs: {scan_stats['skipped_dirs']}")
+        if scan_stats["perm_errors"]:
+            log(f" permission errors: {scan_stats['perm_errors']}")
+        if transcripts_read or transcript_errors:
+            log(f" transcripts: {transcripts_read} read, {transcript_errors} errors")

         log("updating database...")
+        scan_duration = format_duration(elapsed)

         # Remove database entries for files under this base folder that are no longer on disk.
         pattern = foldername + os.sep + '%'
@@ -238,11 +313,14 @@ def updatefileindex():
         # Commit changes after processing this base folder.
         search_db.commit()

-        folder_scanned = len(scanned_files)
+        folder_scanned = scanned_count
         totals["scanned"] += folder_scanned
         log(f"Indexed {folder_scanned} files (deleted {deleted_count}) in '{foldername}'")
+        log(f"Scan duration for '{foldername}': {scan_duration}")

+    total_elapsed = max(monotonic() - total_start, 0.0001)
     log(f"Index update finished: folders={totals['folders']}, files indexed={totals['scanned']}, removed={totals['deleted']}")
+    log(f"Total index duration: {format_duration(total_elapsed)}")
     return "File index updated successfully"

 def convert_dates(search_db,