add hitcount to search index

lelo 2025-04-05 09:19:33 +02:00
parent 1fc51d578e
commit 5ae6a1dea2
3 changed files with 32 additions and 7 deletions

.gitignore

@@ -7,6 +7,7 @@
 /postgres_data
 /instance
 /__pycache__
+/search.db
 /access_log.db
 /access_log.db.bak
 /folder_config.json

(search indexer script; filename not shown)

@@ -3,13 +3,20 @@ import json
 import sqlite3
 
 SEARCH_DB_NAME = 'search.db'
+ACCESS_LOG_DB_NAME = 'access_log.db'
 
+# Connect to the search database.
 search_db = sqlite3.connect(SEARCH_DB_NAME, check_same_thread=False)
 search_db.row_factory = sqlite3.Row
+
+# Open access_log.db in read-only mode.
+access_log_db = sqlite3.connect(f'file:{ACCESS_LOG_DB_NAME}?mode=ro', uri=True)
+access_log_db.row_factory = sqlite3.Row
 
 def init_db():
     """Initializes the database with the required schema."""
     cursor = search_db.cursor()
+    # Create table with the new 'hitcount' column.
     cursor.execute('''
         CREATE TABLE IF NOT EXISTS files (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -17,10 +24,18 @@ def init_db():
             filename TEXT,
             filetype TEXT,
             transcript TEXT,
+            hitcount INTEGER DEFAULT 0,
             UNIQUE(relative_path, filename)
         )
     ''')
     search_db.commit()
+    # If the table already existed, try to add the 'hitcount' column.
+    try:
+        cursor.execute("ALTER TABLE files ADD COLUMN hitcount INTEGER DEFAULT 0")
+    except sqlite3.OperationalError:
+        # Likely the column already exists, so we ignore this error.
+        pass
+    search_db.commit()
 
 def scan_dir(directory):
     """Recursively scan directories using os.scandir for improved performance."""
@@ -37,6 +52,13 @@ def scan_dir(directory):
     except PermissionError:
         return
 
+def get_hit_count(relative_path):
+    """Returns the hit count for a given file from the access log database."""
+    cursor = access_log_db.cursor()
+    cursor.execute("SELECT COUNT(*) AS hit_count FROM file_access_log WHERE rel_path = ?", (relative_path,))
+    row = cursor.fetchone()
+    return row["hit_count"] if row else 0
+
 def updatefileindex():
     cursor = search_db.cursor()
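get_hit_count only assumes that access_log.db exposes a file_access_log table with a rel_path column; the rest of that schema is not shown in this commit. A minimal sketch of a compatible table, useful for testing the query in isolation (the accessed_at column is an assumption for illustration):

import sqlite3

db = sqlite3.connect(":memory:")
db.row_factory = sqlite3.Row
db.execute("""
    CREATE TABLE file_access_log (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        rel_path TEXT,
        accessed_at TEXT DEFAULT CURRENT_TIMESTAMP  -- assumed column, not in this diff
    )
""")
db.execute("INSERT INTO file_access_log (rel_path) VALUES (?)", ("docs/readme.txt",))
# Same COUNT(*) shape as get_hit_count() above.
row = db.execute(
    "SELECT COUNT(*) AS hit_count FROM file_access_log WHERE rel_path = ?",
    ("docs/readme.txt",),
).fetchone()
print(row["hit_count"])  # -> 1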
@@ -54,7 +76,7 @@ def updatefileindex():
         base_len = len(norm_folderpath) + 1
 
         # Accumulate scanned file data and keys for this base folder.
-        scanned_files = []  # Each entry: (relative_path, filename, filetype, transcript)
+        scanned_files = []  # Each entry: (relative_path, filename, filetype, transcript, hitcount)
         current_keys = set()
 
         for entry in scan_dir(norm_folderpath):
@@ -66,7 +88,6 @@
             rel_part = os.path.relpath(entry_path, norm_folderpath)
             # Prepend the foldername so it becomes part of the stored relative path.
             relative_path = os.path.join(foldername, rel_part).replace(os.sep, '/')
-            print(relative_path)
 
             filetype = os.path.splitext(entry.name)[1].lower()
             transcript = None
@@ -82,7 +103,10 @@
                 except Exception:
                     transcript = None
 
-            scanned_files.append((relative_path, entry.name, filetype, transcript))
+            # Retrieve the hit count for this file.
+            hit_count = get_hit_count(relative_path)
+            scanned_files.append((relative_path, entry.name, filetype, transcript, hit_count))
             current_keys.add((relative_path, entry.name))
 
         # Remove database entries for files under this base folder that are no longer on disk.
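Since get_hit_count() runs one query per scanned file, a large index means many round trips into access_log.db. A possible alternative, sketched below and not part of this commit, is to fetch all counts once with GROUP BY and look them up from a dict inside the loop (load_hit_counts is a hypothetical helper):

def load_hit_counts():
    """Map rel_path -> number of access-log rows, fetched in a single query."""
    cursor = access_log_db.cursor()
    cursor.execute(
        "SELECT rel_path, COUNT(*) AS hit_count FROM file_access_log GROUP BY rel_path"
    )
    return {row["rel_path"]: row["hit_count"] for row in cursor.fetchall()}

# In updatefileindex(), before the scan loop:
#     hit_counts = load_hit_counts()
# and per file:
#     hit_count = hit_counts.get(relative_path, 0)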
@@ -96,7 +120,7 @@
 
         # Bulk write the scanned files using INSERT OR REPLACE.
         cursor.executemany(
-            "INSERT OR REPLACE INTO files (relative_path, filename, filetype, transcript) VALUES (?, ?, ?, ?)",
+            "INSERT OR REPLACE INTO files (relative_path, filename, filetype, transcript, hitcount) VALUES (?, ?, ?, ?, ?)",
             scanned_files
         )
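Worth noting about the bulk write above: INSERT OR REPLACE resolves the UNIQUE(relative_path, filename) conflict by deleting the old row and inserting a fresh one, so a file's id changes on every index run. If stable ids ever matter, an upsert is one alternative; a sketch assuming SQLite 3.24 or newer:

cursor.executemany(
    """
    INSERT INTO files (relative_path, filename, filetype, transcript, hitcount)
    VALUES (?, ?, ?, ?, ?)
    ON CONFLICT(relative_path, filename) DO UPDATE SET
        filetype   = excluded.filetype,
        transcript = excluded.transcript,
        hitcount   = excluded.hitcount
    """,
    scanned_files
)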
@@ -105,9 +129,9 @@
     return "File index updated successfully"
 
 if __name__ == "__main__":
     init_db()            # Initialize the database schema if it doesn't exist
     updatefileindex()    # Update the file index
-    search_db.close()    # Close the database connection
-    print("Database connection closed.")
+    search_db.close()        # Close the search database connection
+    access_log_db.close()    # Close the access log connection
+    print("Database connections closed.")

search.db

Binary file not shown.