improve indexing
parent c7c52f0dc2
commit a5930cb506
@@ -2,14 +2,22 @@ import os
 import json
 import sqlite3
 from datetime import datetime
+from time import monotonic
 import re
 import helperfunctions as hf
-from collections import defaultdict
 
 SEARCH_DB_NAME = 'search.db'
 ACCESS_LOG_DB_NAME = 'access_log.db'
 FOLDER_CONFIG = 'folder_secret_config.json'
-IGNORED_DIRS = {"Transkription", "@eaDir", ".app", "#recycle"}
+TRANSCRIPT_DIRNAME = "Transkription"
+TRANSCRIPT_EXT = ".md"
+IGNORED_DIRS = {TRANSCRIPT_DIRNAME, "@eaDir", ".app", "#recycle"}
+
+# Logging/progress tuning (keep output light by default)
+LOG_STRUCTURE_DEPTH = int(os.getenv("INDEX_LOG_STRUCTURE_DEPTH", "0") or 0)
+PROGRESS_EVERY_SECS = float(os.getenv("INDEX_PROGRESS_SECS", "30"))
+PROGRESS_EVERY_FILES = int(os.getenv("INDEX_PROGRESS_FILES", "5000"))
+MAX_ERROR_LOGS = int(os.getenv("INDEX_MAX_ERROR_LOGS", "5"))
 
 # Connect to the search database.
 search_db = sqlite3.connect(SEARCH_DB_NAME, check_same_thread=False)
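The new tuning constants read environment variables at import time, so logging volume can be adjusted per run without code changes. A minimal sketch of how they might be set (values below are made up for illustration; unset variables fall back to the defaults shown in the diff):

import os

# Hypothetical tuning values; each variable is optional.
os.environ["INDEX_LOG_STRUCTURE_DEPTH"] = "2"   # log the folder structure two levels deep
os.environ["INDEX_PROGRESS_SECS"] = "10"        # progress line at least every 10 seconds
os.environ["INDEX_PROGRESS_FILES"] = "1000"     # ...or after every 1000 scanned files
os.environ["INDEX_MAX_ERROR_LOGS"] = "3"        # print at most 3 permission errors

# Same expression as in the diff; with the variable unset this yields 0 (off).
LOG_STRUCTURE_DEPTH = int(os.getenv("INDEX_LOG_STRUCTURE_DEPTH", "0") or 0)
print(LOG_STRUCTURE_DEPTH)  # -> 2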
@@ -24,10 +32,25 @@ def log(message: str):
     print(message, flush=True)
 
 
+def log_permission_error(path: str, stats: dict):
+    """Log permission errors sparingly to avoid noisy output."""
+    stats["perm_errors"] += 1
+    if stats["perm_errors"] <= MAX_ERROR_LOGS:
+        log(f"Permission denied: {path}")
+    elif stats["perm_errors"] == MAX_ERROR_LOGS + 1:
+        log("Further permission errors suppressed.")
+
+
 def skip_dir(name: str) -> bool:
     """Return True when a directory name should be skipped during traversal/logging."""
     return name.startswith('.') or name in IGNORED_DIRS
 
+
+def format_duration(seconds: float) -> str:
+    total_secs = int(seconds)
+    minutes, secs = divmod(total_secs, 60)
+    return f"{minutes}m {secs:02d}s"
+
 def init_db():
     """Initializes the database with the required schema."""
     cursor = search_db.cursor()
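log_permission_error prints the first MAX_ERROR_LOGS offending paths, then a single "Further permission errors suppressed." notice, and afterwards only counts. format_duration splits minutes and seconds only (hours are not broken out, fractions are truncated); a self-contained copy to show the output format:

def format_duration(seconds: float) -> str:
    total_secs = int(seconds)
    minutes, secs = divmod(total_secs, 60)
    return f"{minutes}m {secs:02d}s"

print(format_duration(7.9))   # 0m 07s
print(format_duration(125))   # 2m 05s
print(format_duration(3725))  # 62m 05s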
@@ -59,21 +82,28 @@ def init_db():
 
     search_db.commit()
 
-def scan_dir(directory):
-    """Recursively scan directories using os.scandir for improved performance."""
-    try:
-        with os.scandir(directory) as it:
-            for entry in it:
-                if entry.is_dir(follow_symlinks=False):
-                    # Skip unwanted directories immediately.
-                    if skip_dir(entry.name):
-                        continue
-                    yield from scan_dir(entry.path)
-                elif entry.is_file(follow_symlinks=False):
-                    yield entry
-    except PermissionError:
-        log(f"Permission denied: {directory}")
-        return
+def scan_dir(directory: str, stats: dict):
+    """Iteratively scan directories using os.scandir for improved performance."""
+    stack = [directory]
+    while stack:
+        current = stack.pop()
+        stats["dirs"] += 1
+        try:
+            with os.scandir(current) as it:
+                for entry in it:
+                    try:
+                        if entry.is_dir(follow_symlinks=False):
+                            # Skip unwanted directories immediately.
+                            if skip_dir(entry.name):
+                                stats["skipped_dirs"] += 1
+                                continue
+                            stack.append(entry.path)
+                        elif entry.is_file(follow_symlinks=False):
+                            yield entry
+                    except PermissionError:
+                        log_permission_error(entry.path, stats)
+        except PermissionError:
+            log_permission_error(current, stats)
 
 def get_hit_count(relative_path):
     """Returns the hit count for a given file from the access log database."""
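scan_dir is rewritten from a recursive generator into an explicit-stack loop: deep directory trees no longer risk the recursion limit, skipped directories and permission errors are counted in the caller-supplied stats dict, and a PermissionError is now logged (capped) and counted per entry or per directory instead of ending the scan of that directory. A self-contained sketch of the core pattern, without the skip/stats bookkeeping of the real function:

import os

def iter_files(root: str):
    """Yield os.DirEntry objects for regular files under root, iteratively."""
    stack = [root]
    while stack:
        current = stack.pop()
        with os.scandir(current) as it:
            for entry in it:
                if entry.is_dir(follow_symlinks=False):
                    stack.append(entry.path)   # visit later instead of recursing
                elif entry.is_file(follow_symlinks=False):
                    yield entry

for entry in iter_files("."):
    print(entry.path)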
@@ -94,6 +124,26 @@ def get_hit_counts_for_basefolder(basefolder: str) -> dict:
     return {row["rel_path"]: row["hit_count"] for row in cursor.fetchall()}
 
 
+def build_transcript_index(transcript_dir: str, stats: dict):
+    """Return a dict of basename -> transcript path for a transcript directory."""
+    try:
+        with os.scandir(transcript_dir) as it:
+            index = {}
+            for entry in it:
+                if not entry.is_file(follow_symlinks=False):
+                    continue
+                name = entry.name
+                if not name.endswith(TRANSCRIPT_EXT):
+                    continue
+                index[name[:-len(TRANSCRIPT_EXT)]] = entry.path
+            return index
+    except FileNotFoundError:
+        return None
+    except PermissionError:
+        log_permission_error(transcript_dir, stats)
+        return None
+
+
 def log_structure(root_path, max_depth=None, show_files=False):
     """
     Log folder structure up to max_depth levels (root = depth 1).
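build_transcript_index replaces the old per-file os.path.exists probe: each "Transkription" folder is listed once with os.scandir, and updatefileindex caches the resulting basename-to-path dict (or None when the folder is missing or unreadable) per parent directory. A simplified, self-contained sketch of the basename-index idea, exercised against a temporary directory (file names invented for the example):

import os
import tempfile

TRANSCRIPT_EXT = ".md"

def build_index(transcript_dir):
    # Map "media basename without extension" -> transcript path.
    try:
        with os.scandir(transcript_dir) as it:
            return {
                e.name[:-len(TRANSCRIPT_EXT)]: e.path
                for e in it
                if e.is_file(follow_symlinks=False) and e.name.endswith(TRANSCRIPT_EXT)
            }
    except FileNotFoundError:
        return None

with tempfile.TemporaryDirectory() as d:
    open(os.path.join(d, "2024-01-07 Predigt.md"), "w").close()
    index = build_index(d)
    print(index.get("2024-01-07 Predigt"))  # path to the .md file
    print(index.get("missing"))             # None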
@@ -130,14 +180,8 @@ def log_file(relative_path: str, filename: str):
     log(f" file: {relative_path} ({filename})")
 
 
-def log_directory_batch(directory: str, files: list[str]):
-    """Log file count for a directory without listing filenames."""
-    if not files:
-        return
-    log(f" Dir {directory or '/'}: {len(files)} files")
-
-
 def updatefileindex():
+    total_start = monotonic()
     cursor = search_db.cursor()
     totals = {"folders": 0, "scanned": 0, "deleted": 0}
 
@@ -153,71 +197,102 @@ def updatefileindex():
         log(f"Processing folder: {foldername}")
         raw_folderpath = folder.get("folderpath")
         norm_folderpath = os.path.normpath(raw_folderpath)
-        # Only log folder names up to 3 levels deep; suppress filenames
-        log_structure(norm_folderpath, max_depth=1, show_files=False)
+        # Optional shallow structure log (off by default)
+        if LOG_STRUCTURE_DEPTH > 0:
+            log_structure(norm_folderpath, max_depth=LOG_STRUCTURE_DEPTH, show_files=False)
         # Precompute the length of the base folder path (plus one for the separator)
-        base_len = len(norm_folderpath) + 1
+        base_prefix = norm_folderpath + os.sep
+        base_len = len(base_prefix)
         # Prefetch hit counts for this basefolder to avoid per-file queries
         hitcount_map = get_hit_counts_for_basefolder(foldername)
 
         # Accumulate scanned file data and keys for this base folder.
         scanned_files = []  # Each entry: (relative_path, basefolder, filename, filetype, transcript, hitcount)
         current_keys = set()
-        dir_files = defaultdict(list)  # map of directory -> list of filenames
-        for entry in scan_dir(norm_folderpath):
+        scan_stats = {"dirs": 0, "skipped_dirs": 0, "perm_errors": 0}
+        transcript_cache = {}
+        transcripts_read = 0
+        transcript_errors = 0
+
+        site = None
+        if foldername == 'Gottesdienste Speyer':
+            site = 'Speyer'
+        elif foldername == 'Gottesdienste Schwegenheim':
+            site = 'Schwegenheim'
+
+        start_time = monotonic()
+        last_log_time = start_time
+        next_log_count = PROGRESS_EVERY_FILES
+        scanned_count = 0
+
+        extract_structure = hf.extract_structure_from_string
+        extract_date = hf.extract_date_from_string
+
+        for entry in scan_dir(norm_folderpath, scan_stats):
             transcript = None
-            entry_path = os.path.normpath(entry.path)
+            scanned_count += 1
+            entry_path = entry.path
             # Get relative part by slicing if possible.
-            if entry_path.startswith(norm_folderpath):
+            if entry_path.startswith(base_prefix):
                 rel_part = entry_path[base_len:]
             else:
                 rel_part = os.path.relpath(entry_path, norm_folderpath)
             # Prepend the foldername so it becomes part of the stored relative path.
-            relative_path = os.path.join(foldername, rel_part).replace(os.sep, '/')
-            filetype = os.path.splitext(entry.name)[1].lower()
+            rel_part = rel_part.replace(os.sep, '/')
+            relative_path = f"{foldername}/{rel_part}"
+            name_root, name_ext = os.path.splitext(entry.name)
+            filetype = name_ext.lower()
 
             # Retrieve the hit count for this file from pre-fetched map.
             hit_count = hitcount_map.get(relative_path, 0)
 
-            # Determine the site
-            if foldername == 'Gottesdienste Speyer':
-                site = 'Speyer'
-            elif foldername == 'Gottesdienste Schwegenheim':
-                site = 'Schwegenheim'
-            else:
-                site = None
-
             # Check for a corresponding transcript file in a sibling "Transkription" folder.
             parent_dir = os.path.dirname(entry_path)
-            transcript_dir = os.path.join(parent_dir, "Transkription")
-            transcript_filename = os.path.splitext(entry.name)[0] + ".md"
-            transcript_path = os.path.join(transcript_dir, transcript_filename)
-            if os.path.exists(transcript_path):
-                try:
-                    with open(transcript_path, 'r', encoding='utf-8') as tf:
-                        transcript = tf.read()
-                except Exception:
-                    transcript = None
+            transcript_index = transcript_cache.get(parent_dir)
+            if transcript_index is None and parent_dir not in transcript_cache:
+                transcript_dir = os.path.join(parent_dir, TRANSCRIPT_DIRNAME)
+                transcript_index = build_transcript_index(transcript_dir, scan_stats)
+                transcript_cache[parent_dir] = transcript_index
+            if transcript_index:
+                transcript_path = transcript_index.get(name_root)
+                if transcript_path:
+                    try:
+                        with open(transcript_path, 'r', encoding='utf-8') as tf:
+                            transcript = tf.read()
+                        transcripts_read += 1
+                    except Exception:
+                        transcript_errors += 1
 
-            category, titel, name = hf.extract_structure_from_string(entry.name)
+            category, titel, name = extract_structure(entry.name)
 
-            performance_date = hf.extract_date_from_string(relative_path)
+            performance_date = extract_date(relative_path)
 
-            # Debug: batch file logging per directory
-            dir_files[os.path.dirname(relative_path)].append(entry.name)
-
             scanned_files.append((relative_path, foldername, entry.name, filetype, category, titel, name, performance_date, site, transcript, hit_count))
             current_keys.add((relative_path, entry.name))
 
-        # After scanning, log grouped files per directory
-        for d, files in dir_files.items():
-            log_directory_batch(d, files)
+            # Light progress output
+            now = monotonic()
+            if scanned_count >= next_log_count or (now - last_log_time) >= PROGRESS_EVERY_SECS:
+                elapsed = max(now - start_time, 0.0001)
+                rate = scanned_count / elapsed
+                log(f" progress: {scanned_count} files, {scan_stats['dirs']} dirs, {rate:.1f} files/s")
+                last_log_time = now
+                next_log_count = scanned_count + PROGRESS_EVERY_FILES
 
         # Progress indicator
-        dir_count = len(dir_files)
-        file_count = len(scanned_files)
-        log(f"Found {dir_count} folders and {file_count} files in '{foldername}'.")
+        dir_count = scan_stats["dirs"]
+        file_count = scanned_count
+        elapsed = max(monotonic() - start_time, 0.0001)
+        rate = file_count / elapsed
+        log(f"Scan summary for '{foldername}': {dir_count} dirs, {file_count} files, {rate:.1f} files/s")
+        if scan_stats["skipped_dirs"]:
+            log(f" skipped dirs: {scan_stats['skipped_dirs']}")
+        if scan_stats["perm_errors"]:
+            log(f" permission errors: {scan_stats['perm_errors']}")
+        if transcripts_read or transcript_errors:
+            log(f" transcripts: {transcripts_read} read, {transcript_errors} errors")
         log("updating database...")
+        scan_duration = format_duration(elapsed)
 
         # Remove database entries for files under this base folder that are no longer on disk.
         pattern = foldername + os.sep + '%'
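Inside the per-file loop the commit also drops the per-entry os.path.normpath and os.path.join calls: entry.path is compared against a precomputed base_prefix (base folder plus separator) and sliced, with os.path.relpath kept only as a fallback, and the stored relative path is assembled with an f-string. A self-contained sketch of the equivalence (paths invented for the example):

import os

foldername = "Gottesdienste Speyer"
norm_folderpath = os.path.normpath("/srv/media/Gottesdienste Speyer")
base_prefix = norm_folderpath + os.sep
base_len = len(base_prefix)

entry_path = os.path.join(norm_folderpath, "2024", "Predigt.mp3")
if entry_path.startswith(base_prefix):
    rel_part = entry_path[base_len:]                          # fast path: plain slice
else:
    rel_part = os.path.relpath(entry_path, norm_folderpath)   # fallback
relative_path = f"{foldername}/{rel_part.replace(os.sep, '/')}"
print(relative_path)  # Gottesdienste Speyer/2024/Predigt.mp3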
@@ -238,11 +313,14 @@ def updatefileindex():
 
         # Commit changes after processing this base folder.
         search_db.commit()
-        folder_scanned = len(scanned_files)
+        folder_scanned = scanned_count
         totals["scanned"] += folder_scanned
         log(f"Indexed {folder_scanned} files (deleted {deleted_count}) in '{foldername}'")
+        log(f"Scan duration for '{foldername}': {scan_duration}")
 
+    total_elapsed = max(monotonic() - total_start, 0.0001)
     log(f"Index update finished: folders={totals['folders']}, files indexed={totals['scanned']}, removed={totals['deleted']}")
+    log(f"Total index duration: {format_duration(total_elapsed)}")
     return "File index updated successfully"
 
 def convert_dates(search_db,