index and search all files

This commit is contained in:
lelo 2026-01-23 15:15:42 +00:00
parent f8e705ab20
commit 77d1672efb
5 changed files with 218 additions and 11 deletions

View File

@ -4,6 +4,7 @@ import sqlite3
from datetime import datetime
import re
import helperfunctions as hf
from collections import defaultdict
SEARCH_DB_NAME = 'search.db'
ACCESS_LOG_DB_NAME = 'access_log.db'
@ -17,6 +18,10 @@ search_db.row_factory = sqlite3.Row
access_log_db = sqlite3.connect(f'file:{ACCESS_LOG_DB_NAME}?mode=ro', uri=True)
access_log_db.row_factory = sqlite3.Row
def log(message: str):
    """Print message to standard output and flush right away so it is not held back by buffering."""
    print(message, end="\n", flush=True)
def init_db():
"""Initializes the database with the required schema."""
cursor = search_db.cursor()
@ -54,13 +59,16 @@ def scan_dir(directory):
with os.scandir(directory) as it:
for entry in it:
if entry.is_dir(follow_symlinks=False):
# Skip transcription directories immediately.
if entry.name.lower() == "transkription":
# Skip unwanted directories immediately.
if entry.name.startswith(('.', '@', '#')):
continue
if entry.name.lower() in {"transkription", ".app", "#recycle"}:
continue
yield from scan_dir(entry.path)
elif entry.is_file(follow_symlinks=False):
yield entry
except PermissionError:
log(f"Permission denied: {directory}")
return
def get_hit_count(relative_path):
@ -71,8 +79,65 @@ def get_hit_count(relative_path):
return row["hit_count"] if row else 0
def get_hit_counts_for_basefolder(basefolder: str) -> dict:
    """Count access-log rows per file for one base folder.

    Runs a single grouped query on the access-log database and returns a
    dict that maps every rel_path matching the LIKE pattern
    "<basefolder>/%" to the number of file_access_log rows stored for it.
    Paths without any rows do not appear in the result.
    """
    query = (
        "SELECT rel_path, COUNT(*) AS hit_count FROM file_access_log "
        "WHERE rel_path LIKE ? GROUP BY rel_path"
    )
    cur = access_log_db.cursor()
    cur.execute(query, (f"{basefolder}/%",))
    counts = {}
    for record in cur.fetchall():
        counts[record["rel_path"]] = record["hit_count"]
    return counts
def log_structure(root_path, max_depth=None, show_files=False):
    """
    Write an indented outline of the directory tree below root_path to the log.

    root_path itself is depth 1; levels deeper than max_depth are not visited
    (max_depth=None means no limit). Directories whose name starts with '.',
    '@' or '#', or equals "transkription" (case-insensitive), are left out.
    Entries that are not directories are listed only when show_files is True.
    A directory that cannot be read is reported as "[permission denied]".
    """
    limit_text = f"<= {max_depth}" if max_depth is not None else "all"
    log(f"Folder structure (depth {limit_text}) for '{root_path}':")

    def _dirs_first(item):
        # Directories sort ahead of everything else, then by lower-cased name.
        return (not item.is_dir(follow_symlinks=False), item.name.lower())

    def _walk(path, depth):
        if max_depth is not None and depth > max_depth:
            return
        # One indent unit per level below the root.
        pad = " " * (depth - 1)
        try:
            with os.scandir(path) as listing:
                for item in sorted(listing, key=_dirs_first):
                    if not item.is_dir(follow_symlinks=False):
                        if show_files:
                            log(f"{pad}- {item.name}")
                        continue
                    hidden = item.name.startswith(('.', '@', '#'))
                    if hidden or item.name.lower() in {"transkription"}:
                        continue
                    log(f"{pad}- {item.name}/")
                    _walk(item.path, depth + 1)
        except PermissionError:
            log(f"{pad}- [permission denied]")

    _walk(root_path, depth=1)
def log_file(relative_path: str, filename: str):
    """Debug aid: log one indexed file as its relative path followed by the filename in parentheses."""
    entry_line = f" file: {relative_path} ({filename})"
    log(entry_line)
def log_directory_batch(directory: str, files: list[str]):
    """Log how many files were collected for directory; filenames are not printed.

    Nothing is logged when files is empty. An empty directory string is
    shown as '/'.
    """
    if files:
        shown_dir = directory or '/'
        log(f" Dir {shown_dir}: {len(files)} files")
def updatefileindex():
cursor = search_db.cursor()
totals = {"folders": 0, "scanned": 0, "deleted": 0}
# Load folder configuration from JSON file.
with open(FOLDER_CONFIG, "r", encoding="utf-8") as f:
@ -81,16 +146,21 @@ def updatefileindex():
# Process each configured base folder.
for config in config_data:
for folder in config.get("folders", []):
totals["folders"] += 1
foldername = folder.get("foldername")
print(f"Processing folder: {foldername}")
log(f"Processing folder: {foldername}")
raw_folderpath = folder.get("folderpath")
norm_folderpath = os.path.normpath(raw_folderpath)
log_structure(norm_folderpath, max_depth=None, show_files=False)
# Precompute the length of the base folder path (plus one for the separator)
base_len = len(norm_folderpath) + 1
# Prefetch hit counts for this basefolder to avoid per-file queries
hitcount_map = get_hit_counts_for_basefolder(foldername)
# Accumulate scanned file data and keys for this base folder.
scanned_files = [] # Each entry: (relative_path, basefolder, filename, filetype, transcript, hitcount)
current_keys = set()
dir_files = defaultdict(list) # map of directory -> list of filenames
for entry in scan_dir(norm_folderpath):
transcript = None
entry_path = os.path.normpath(entry.path)
@ -103,12 +173,8 @@ def updatefileindex():
relative_path = os.path.join(foldername, rel_part).replace(os.sep, '/')
filetype = os.path.splitext(entry.name)[1].lower()
if filetype not in ['.mp3', '.wav', '.ogg', '.m4a', '.flac']:
# Skip non-audio files.
continue
# Retrieve the hit count for this file.
hit_count = get_hit_count(relative_path)
# Retrieve the hit count for this file from pre-fetched map.
hit_count = hitcount_map.get(relative_path, 0)
# Determine the site
if foldername == 'Gottesdienste Speyer':
@ -134,15 +200,24 @@ def updatefileindex():
performance_date = hf.extract_date_from_string(relative_path)
# Debug: batch file logging per directory
dir_files[os.path.dirname(relative_path)].append(entry.name)
scanned_files.append((relative_path, foldername, entry.name, filetype, category, titel, name, performance_date, site, transcript, hit_count))
current_keys.add((relative_path, entry.name))
# After scanning, log grouped files per directory
for d, files in dir_files.items():
log_directory_batch(d, files)
# Remove database entries for files under this base folder that are no longer on disk.
pattern = foldername + os.sep + '%'
cursor.execute("SELECT id, relative_path, filename FROM files WHERE relative_path LIKE ?", (pattern,))
db_rows = cursor.fetchall()
keys_in_db = set((row["relative_path"], row["filename"]) for row in db_rows)
keys_to_delete = keys_in_db - current_keys
deleted_count = len(keys_to_delete)
totals["deleted"] += deleted_count
for key in keys_to_delete:
cursor.execute("DELETE FROM files WHERE relative_path = ? AND filename = ?", key)
@ -154,7 +229,11 @@ def updatefileindex():
# Commit changes after processing this base folder.
search_db.commit()
folder_scanned = len(scanned_files)
totals["scanned"] += folder_scanned
log(f"Indexed {folder_scanned} files (deleted {deleted_count}) in '{foldername}'")
log(f"Index update finished: folders={totals['folders']}, files indexed={totals['scanned']}, removed={totals['deleted']}")
return "File index updated successfully"
def convert_dates(search_db,

View File

@ -12,12 +12,23 @@ search_db.row_factory = sqlite3.Row
with open("app_config.json", 'r') as file:
app_config = json.load(file)
# Search filter groups: group name (the values accepted in the "filetype"
# form field) mapped to the file extensions that belong to that group.
FILETYPE_GROUPS = dict(
    audio=('.mp3', '.wav', '.ogg', '.m4a', '.flac'),
    video=('.mp4', '.mov', '.mkv', '.avi', '.webm'),
    image=('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff'),
)
# Every extension claimed by any group above, de-duplicated and sorted.
ALL_GROUP_EXTS = tuple(sorted(set().union(*FILETYPE_GROUPS.values())))
def searchcommand():
query = request.form.get("query", "").strip()
category = request.form.get("category", "").strip()
searchfolder = request.form.get("folder", "").strip()
datefrom = request.form.get("datefrom", "").strip()
dateto = request.form.get("dateto", "").strip()
filetypes = [ft.strip().lower() for ft in request.form.getlist("filetype") if ft.strip()]
if not filetypes:
# Default to audio when nothing selected
filetypes = ['audio']
include_transcript = request.form.get("includeTranscript") in ["true", "on"]
words = [w for w in query.split() if w]
@ -72,6 +83,25 @@ def searchcommand():
if datefrom or dateto:
conditions.append("performance_date IS NOT NULL")
# Filetype filters (multiple selection)
selected_groups = [ft for ft in filetypes if ft in FILETYPE_GROUPS]
include_other = 'other' in filetypes
# If not all groups selected, apply filter
if set(filetypes) != {'audio', 'video', 'image', 'other'}:
clauses = []
if selected_groups:
ext_list = tuple({ext for g in selected_groups for ext in FILETYPE_GROUPS[g]})
placeholders = ",".join("?" for _ in ext_list)
clauses.append(f"filetype IN ({placeholders})")
params.extend(ext_list)
if include_other:
placeholders = ",".join("?" for _ in ALL_GROUP_EXTS)
clauses.append(f"(filetype IS NULL OR filetype = '' OR filetype NOT IN ({placeholders}))")
params.extend(ALL_GROUP_EXTS)
if clauses:
conditions.append("(" + " OR ".join(clauses) + ")")
# Build and execute SQL
sql = "SELECT * FROM files"
if conditions:

View File

@ -1169,3 +1169,10 @@ footer .audio-player-container {
color: var(--brand-ink);
border-color: var(--brand-navy);
}
/* Marks the file entry that was opened from a search result:
   amber outline over a translucent amber fill, with rounded corners. */
.search-highlight {
  border-radius: 4px;
  background-color: rgba(246, 195, 68, 0.25);
  outline: 2px solid rgb(246, 195, 68);
}

View File

@ -5,17 +5,26 @@ document.addEventListener('DOMContentLoaded', function() {
const resultsDiv = document.getElementById('results');
resultsDiv.innerHTML = '<h5>Suchergebnisse:</h5>';
const audioExts = ['.mp3', '.wav', '.ogg', '.m4a', '.flac'];
if (data.results && data.results.length > 0) {
data.results.forEach(file => {
const card = document.createElement('div');
const filenameWithoutExtension = file.filename.split('.').slice(0, -1).join('.');
const parentFolder = file.relative_path.split('/').slice(0, -1).join('/');
const transcriptURL = '/transcript/' + parentFolder + '/Transkription/' + filenameWithoutExtension + '.md';
const isAudio = audioExts.includes((file.filetype || '').toLowerCase());
const encodedRelPath = encodeURI(file.relative_path);
card.className = 'card';
const fileAction = isAudio
? `<button class="btn btn-light play-audio-btn" onclick="player.loadTrack('${file.relative_path}')" style="width:100%;"><i class="bi bi-volume-up"></i> ${filenameWithoutExtension}</button>`
: `<a class="btn btn-light download-btn" href="/media/${encodedRelPath}" download style="width:100%;"><i class="bi bi-download"></i> ${file.filename}</a>`;
card.innerHTML = `
<div class="card-body">
<p><button class="btn btn-light" onclick="player.loadTrack('${file.relative_path}')" style="width:100%;"><i class="bi bi-volume-up"></i> ${filenameWithoutExtension}</button></p>
<p><button onclick="window.open('/path/${file.relative_path}', '_self');" class="btn btn-light btn-sm" style="width:100%;"><i class="bi bi-folder"></i> ${parentFolder}</button></p>
<p>${fileAction}</p>
<p><button class="btn btn-light btn-sm folder-open-btn" data-folder="${parentFolder}" data-file="${file.relative_path}" style="width:100%;"><i class="bi bi-folder"></i> ${parentFolder || 'Ordner'}</button></p>
<p class="card-text">Anzahl Downloads: ${file.hitcount}</p>
${ file.performance_date !== undefined ? `<p class="card-text">Datum: ${file.performance_date}</p>` : ``}
${ file.transcript_hits !== undefined ? `<p class="card-text">Treffer im Transkript: ${file.transcript_hits} <a href="#" class="show-transcript" data-url="${transcriptURL}" data-audio-url="${file.relative_path}" highlight="${file.query}"><i class="bi bi-journal-text"></i></a></p>` : ``}
@ -24,6 +33,7 @@ document.addEventListener('DOMContentLoaded', function() {
resultsDiv.appendChild(card);
});
attachEventListeners();
attachSearchFolderButtons();
} else {
resultsDiv.innerHTML = '<p>No results found.</p>';
}
@ -55,6 +65,22 @@ document.addEventListener('DOMContentLoaded', function() {
}
}
// Restore previously selected filetypes (multi-select). Default to audio if none stored.
const previousFiletypes = localStorage.getItem("searchFiletypes");
if (previousFiletypes) {
try {
const list = JSON.parse(previousFiletypes);
document.querySelectorAll('input[name=\"filetype\"]').forEach(cb => {
cb.checked = list.includes(cb.value);
});
} catch (e) {
console.error('Error parsing stored filetypes', e);
document.getElementById('filetype-audio').checked = true;
}
} else {
document.getElementById('filetype-audio').checked = true;
}
// Restore the checkbox state for "Im Transkript suchen"
const previousIncludeTranscript = localStorage.getItem("searchIncludeTranscript");
if (previousIncludeTranscript !== null) {
@ -74,6 +100,15 @@ document.addEventListener('DOMContentLoaded', function() {
const categoryRadio = document.querySelector('input[name="category"]:checked');
const category = categoryRadio ? categoryRadio.value : '';
// Get selected filetypes (allow multiple). Default to audio if none selected.
const filetypeCheckboxes = document.querySelectorAll('input[name=\"filetype\"]');
let filetypes = Array.from(filetypeCheckboxes).filter(cb => cb.checked).map(cb => cb.value);
if (filetypes.length === 0) {
// enforce audio as default when user unchecked all
document.getElementById('filetype-audio').checked = true;
filetypes = ['audio'];
}
// Prevent accidental re-selection of already selected radio buttons
const radios = document.querySelectorAll('input[name="category"]');
radios.forEach(radio => {
@ -98,6 +133,7 @@ document.addEventListener('DOMContentLoaded', function() {
formData.append('datefrom', document.getElementById('datefrom').value);
formData.append('dateto', document.getElementById('dateto').value);
formData.append('includeTranscript', includeTranscript);
filetypes.forEach(ft => formData.append('filetype', ft));
const settleSpinner = () => {
clearTimeout(spinnerTimer);
@ -121,6 +157,7 @@ document.addEventListener('DOMContentLoaded', function() {
// Save the search word, selected category, and checkbox state in localStorage
localStorage.setItem("searchQuery", query);
localStorage.setItem("searchCategory", category);
localStorage.setItem("searchFiletypes", JSON.stringify(filetypes));
localStorage.setItem("searchIncludeTranscript", includeTranscript);
})
.catch(error => {
@ -140,6 +177,7 @@ document.addEventListener('DOMContentLoaded', function() {
localStorage.removeItem("searchResponse");
localStorage.removeItem("searchQuery");
localStorage.removeItem("searchCategory");
localStorage.removeItem("searchFiletypes");
localStorage.removeItem("folder");
localStorage.removeItem("datefrom");
localStorage.removeItem("dateto");
@ -149,6 +187,9 @@ document.addEventListener('DOMContentLoaded', function() {
document.querySelector('input[name="category"][value=""]').checked = true;
const otherRadios = document.querySelectorAll('input[name="category"]:not([value=""])');
otherRadios.forEach(radio => radio.checked = false);
document.getElementById('filetype-audio').checked = true;
const otherFiletypeBoxes = document.querySelectorAll('input[name=\"filetype\"]:not([value=\"audio\"])');
otherFiletypeBoxes.forEach(cb => cb.checked = false);
document.getElementById('folder').value = ''; // Reset to "Alle"
document.getElementById('datefrom').value = ''; // Reset date from
document.getElementById('dateto').value = ''; // Reset date to
@ -164,6 +205,30 @@ document.addEventListener('DOMContentLoaded', function() {
});
});
/**
 * Attach a click handler to every `.folder-open-btn` currently in the document.
 * On click, the default action is suppressed and the button's data-folder /
 * data-file values are handed to openFolderAndHighlight.
 */
function attachSearchFolderButtons() {
  for (const button of document.querySelectorAll('.folder-open-btn')) {
    button.addEventListener('click', (event) => {
      event.preventDefault();
      const { folder, file } = button.dataset;
      openFolderAndHighlight(folder, file);
    });
  }
}
/**
 * Switch to the main view, load a folder, then highlight one file entry in it.
 *
 * @param {string} folderPath - Folder to load; a missing/empty value loads ''.
 * @param {string} filePath - Value compared against the data-url attribute of
 *   the `.play-file` elements to pick the entry to highlight. If no element
 *   matches, nothing is highlighted.
 *
 * NOTE(review): relies on loadDirectory() returning a thenable; confirm
 * against its definition.
 */
function openFolderAndHighlight(folderPath, filePath) {
  const targetFolder = folderPath || '';
  // Switch back to main view before loading folder
  viewMain();
  loadDirectory(targetFolder).then(() => {
    // Compare data-url values directly instead of splicing filePath into a
    // selector string: a path containing a double quote, backslash or newline
    // would otherwise yield an invalid or mis-parsed selector, so the lookup
    // would throw or silently miss the element.
    const target = Array.from(document.querySelectorAll('.play-file'))
      .find(el => el.getAttribute('data-url') === filePath);
    if (target) {
      target.classList.add('search-highlight');
      target.scrollIntoView({ behavior: 'smooth', block: 'center' });
    }
  });
}
function syncThemeColor() {
// read the CSS variable from :root (or any selector)
const cssVar = getComputedStyle(document.documentElement)

View File

@ -126,6 +126,32 @@
<hr>
<!-- Dateityp -->
<div class="mb-3">
<label class="form-label">Dateityp:</label>
<div class="d-flex flex-wrap gap-3">
<div class="form-check">
<input class="form-check-input" type="checkbox" name="filetype" id="filetype-audio" value="audio" checked>
<label class="form-check-label" for="filetype-audio">Audio</label>
</div>
<div class="form-check">
<input class="form-check-input" type="checkbox" name="filetype" id="filetype-video" value="video">
<label class="form-check-label" for="filetype-video">Video</label>
</div>
<div class="form-check">
<input class="form-check-input" type="checkbox" name="filetype" id="filetype-image" value="image">
<label class="form-check-label" for="filetype-image">Bild</label>
</div>
<div class="form-check">
<input class="form-check-input" type="checkbox" name="filetype" id="filetype-other" value="other">
<label class="form-check-label" for="filetype-other">Sonstige</label>
</div>
</div>
<small class="text-muted">Mehrfachauswahl möglich!</small>
</div>
<hr>
<!-- Transkript durchsuchen -->
<div class="form-check mb-3">
<input type="checkbox" class="form-check-input" id="includeTranscript" name="includeTranscript">