diff --git a/analytics.py b/analytics.py
index b42502a..49a8a71 100644
--- a/analytics.py
+++ b/analytics.py
@@ -3,6 +3,8 @@ from flask import render_template, request, session
 from datetime import datetime, timedelta, timezone
 import geoip2.database
 from auth import require_secret
+from collections import defaultdict
+import json
 import os
 
 file_access_temp = []
@@ -12,6 +14,7 @@ DB_NAME = 'access_log.db'
 
 # Create a single global connection to SQLite
 log_db = sqlite3.connect(DB_NAME, check_same_thread=False)
+search_db = sqlite3.connect("search.db", check_same_thread=False)
 
 # geo location
 geoReader = geoip2.database.Reader('GeoLite2-City.mmdb')
@@ -137,6 +140,57 @@ def return_file_access():
     else:
         return []
 
+def songs_dashboard():
+    if 'songs_dashboard_timeframe' not in session:
+        session['songs_dashboard_timeframe'] = "30"
+    timeframe_param = request.args.get("timeframe", session['songs_dashboard_timeframe'])
+    session['songs_dashboard_timeframe'] = timeframe_param
+
+    if 'songs_dashboard_category' not in session:
+        session['songs_dashboard_category'] = "Gemeinsamer Gesang"
+    category = request.args.get("category", session['songs_dashboard_category'])
+    session['songs_dashboard_category'] = category
+
+    if 'songs_dashboard_site' not in session:
+        session['songs_dashboard_site'] = "Speyer"
+    site = request.args.get("site", session['songs_dashboard_site'])
+    session['songs_dashboard_site'] = site
+
+    # Determine cutoff_date based on the timeframe parameter.
+    if timeframe_param == "all":
+        cutoff_date = None  # No date filtering when analyzing all time
+        timeframe = "all"  # Pass the string to the template if needed
+    else:
+        timeframe = int(timeframe_param)
+        now = datetime.now()
+        cutoff_date = now - timedelta(days=timeframe)
+
+    cursor = search_db.cursor()
+    # Query rows matching the selected category and site.
+    query = "SELECT titel, performance_date FROM files WHERE category = ? and site = ?"
+    cursor.execute(query, (category, site))
+    rows = cursor.fetchall()
+
+    # Group and count performances per titel (only if performance_date is within the timeframe,
+    # or count all if cutoff_date is None).
+    performance_counts = defaultdict(int)
+    for titel, performance_date in rows:
+        if performance_date:
+            try:
+                # Parse the date from its "dd.mm.yyyy" format.
+                date_obj = datetime.strptime(performance_date, "%d.%m.%Y")
+            except ValueError:
+                continue
+            # If cutoff_date is None, count all dates; otherwise, filter by cutoff_date.
+            if cutoff_date is None or date_obj >= cutoff_date:
+                performance_counts[titel] += 1
+
+    # Create a list of tuples: (count, titel), sorted in descending order by count.
+    performance_data = [(count, titel) for titel, count in performance_counts.items()]
+    performance_data.sort(reverse=True, key=lambda x: x[0])
+
+    return render_template('songs_dashboard.html', timeframe=timeframe_param, performance_data=performance_data, site=site, category=category)
+
 @require_secret
 def connections():
     return render_template('connections.html')
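The counting loop in `songs_dashboard` is easy to exercise on its own. A minimal sketch, assuming rows shaped like the query result above (the titles and dates are invented); `collections.Counter` with `most_common()` would produce the same descending order in fewer lines:

```python
# Standalone sketch of the grouping in songs_dashboard; the sample rows are
# invented, and unparseable dates would raise here rather than being skipped.
from collections import Counter
from datetime import datetime, timedelta

rows = [("Lobe den Herren", "01.03.2025"),
        ("Lobe den Herren", "15.02.2025"),
        ("Gott ist die Liebe", "10.01.2020")]
cutoff_date = datetime.now() - timedelta(days=36500)  # effectively "all"

counts = Counter(
    titel for titel, performance_date in rows
    if performance_date and datetime.strptime(performance_date, "%d.%m.%Y") >= cutoff_date
)
# most_common() yields (titel, count) pairs already sorted by count, descending.
performance_data = [(count, titel) for titel, count in counts.most_common()]
print(performance_data)  # [(2, 'Lobe den Herren'), (1, 'Gott ist die Liebe')]
```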
@@ -253,7 +307,7 @@ def dashboard():
         dict(bucket=r[0], count=r[1]) for r in distinct_device_data_rows
     ]
 
-    # 3. session['timeframe']-based aggregation
+    # 3. Download trend
     # We'll group by hour if "today", by day if "7days"/"30days", by month if "365days".
     if session['timeframe'] == 'last24hours':
         # Hour: substr(timestamp, 12, 2) -> HH
@@ -343,7 +397,7 @@ def dashboard():
         SELECT city, country, COUNT(*) as count
         FROM file_access_log
         WHERE timestamp >= ?
         {filetype_filter_sql}
-        GROUP BY city
+        GROUP BY city, country
         ORDER BY count DESC
     '''
     with log_db:
@@ -397,7 +451,7 @@ def dashboard():
     # 8. Process location data
     location_data_dict = {}
     for (city, country, cnt) in locations:
-        key = (country, city)
+        key = (city, country)
         location_data_dict[key] = location_data_dict.get(key, 0) + cnt
 
     location_data = [
@@ -424,3 +478,31 @@ def dashboard():
         cached_percentage=cached_percentage,
         timeframe_data=timeframe_data
     )
+
+def export_to_excel():
+    """Export search_db to an Excel file and store it locally."""
+    import pandas as pd
+
+    # Query all data from the search_db
+    query = "SELECT * FROM files"
+    cursor = search_db.cursor()
+    cursor.execute(query)
+    rows = cursor.fetchall()
+
+    # Get column names from the cursor description
+    column_names = [description[0] for description in cursor.description]
+
+    # Create a DataFrame and save it to an Excel file
+    df = pd.DataFrame(rows, columns=column_names)
+    df = df.drop(columns=['transcript'], errors='ignore')  # Drop the 'transcript' column if it exists
+    df.to_excel("search_db.xlsx", index=False)
+
+    # Close the cursor (the module-level connection stays open)
+    cursor.close()
+
+if __name__ == "__main__":
+    print("Running as a standalone script.")
+    export_to_excel()
+    print("Exported search_db to search_db.xlsx")
+
+
\ No newline at end of file
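As an aside, pandas can read directly from a SQLite connection, which removes the manual cursor handling in `export_to_excel`. A sketch of an equivalent export, assuming the same `files` table and the `openpyxl` writer added to requirements.txt below:

```python
# Equivalent export via pandas' SQL reader; a sketch, not the code in this change.
import sqlite3
import pandas as pd

conn = sqlite3.connect("search.db")
df = pd.read_sql_query("SELECT * FROM files", conn)  # column names come from the query
conn.close()
# Drop the bulky transcript column before writing, mirroring the function above.
df.drop(columns=["transcript"], errors="ignore").to_excel("search_db.xlsx", index=False)
```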
diff --git a/app.py b/app.py
index 1ab4b54..e678b70 100755
--- a/app.py
+++ b/app.py
@@ -41,6 +41,8 @@ app.add_url_rule('/remove_secret', view_func=auth.remove_secret, methods=['POST'
 app.add_url_rule('/search', view_func=search.search, methods=['GET'])
 app.add_url_rule('/searchcommand', view_func=search.searchcommand, methods=['POST'])
 
+app.add_url_rule('/songs_dashboard', view_func=a.songs_dashboard)
+
 # Grab the HOST_RULE environment variable
 host_rule = os.getenv("HOST_RULE", "")
 
diff --git a/index_for_search.py b/index_for_search.py
old mode 100644
new mode 100755
index 08e19d0..6a157ed
--- a/index_for_search.py
+++ b/index_for_search.py
@@ -1,9 +1,12 @@
 import os
 import json
 import sqlite3
+from datetime import datetime
+import re
 
 SEARCH_DB_NAME = 'search.db'
 ACCESS_LOG_DB_NAME = 'access_log.db'
+FOLDER_CONFIG = 'folder_permission_config.json'
 
 # Connect to the search database.
 search_db = sqlite3.connect(SEARCH_DB_NAME, check_same_thread=False)
@@ -24,6 +27,11 @@ def init_db():
             basefolder TEXT,
             filename TEXT,
             filetype TEXT,
+            category TEXT,
+            titel TEXT,
+            name TEXT,
+            performance_date TEXT,
+            site TEXT,
             transcript TEXT,
             hitcount INTEGER DEFAULT 0,
             UNIQUE(relative_path, filename)
@@ -41,6 +49,31 @@ def init_db():
     except sqlite3.OperationalError:
         # Likely the column already exists, so we ignore this error.
         pass
+    try:
+        cursor.execute("ALTER TABLE files ADD COLUMN category TEXT")
+    except sqlite3.OperationalError:
+        # Likely the column already exists, so we ignore this error.
+        pass
+    try:
+        cursor.execute("ALTER TABLE files ADD COLUMN titel TEXT")
+    except sqlite3.OperationalError:
+        # Likely the column already exists, so we ignore this error.
+        pass
+    try:
+        cursor.execute("ALTER TABLE files ADD COLUMN name TEXT")
+    except sqlite3.OperationalError:
+        # Likely the column already exists, so we ignore this error.
+        pass
+    try:
+        cursor.execute("ALTER TABLE files ADD COLUMN performance_date TEXT")
+    except sqlite3.OperationalError:
+        # Likely the column already exists, so we ignore this error.
+        pass
+    try:
+        cursor.execute("ALTER TABLE files ADD COLUMN site TEXT")
+    except sqlite3.OperationalError:
+        # Likely the column already exists, so we ignore this error.
+        pass
     search_db.commit()
 
 def scan_dir(directory):
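The five migration blocks above all repeat the same try/except pattern; a small helper would collapse them. A sketch using a hypothetical `add_column_if_missing` name (not part of this change):

```python
# Hypothetical helper that folds the repeated ALTER TABLE try/except blocks above.
# Note: SQLite cannot parameterize identifiers, hence the f-string for table/column.
import sqlite3

def add_column_if_missing(cursor: sqlite3.Cursor, table: str, column: str, coltype: str) -> None:
    try:
        cursor.execute(f"ALTER TABLE {table} ADD COLUMN {column} {coltype}")
    except sqlite3.OperationalError:
        # Likely the column already exists, so we ignore this error.
        pass

# Usage for the columns introduced here:
# for col in ("category", "titel", "name", "performance_date", "site"):
#     add_column_if_missing(cursor, "files", col, "TEXT")
```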
@@ -69,7 +102,7 @@ def updatefileindex():
     cursor = search_db.cursor()
 
     # Load folder configuration from JSON file.
-    with open("folder_config.json", "r", encoding="utf-8") as f:
+    with open(FOLDER_CONFIG, "r", encoding="utf-8") as f:
         config_data = json.load(f)
 
     # Process each configured base folder.
@@ -85,7 +118,6 @@ def updatefileindex():
         # Accumulate scanned file data and keys for this base folder.
         scanned_files = []  # Each entry: (relative_path, basefolder, filename, filetype, transcript, hitcount)
         current_keys = set()
-
         for entry in scan_dir(norm_folderpath):
             entry_path = os.path.normpath(entry.path)
             # Get relative part by slicing if possible.
@@ -96,24 +128,92 @@ def updatefileindex():
             # Prepend the foldername so it becomes part of the stored relative path.
             relative_path = os.path.join(foldername, rel_part).replace(os.sep, '/')
             filetype = os.path.splitext(entry.name)[1].lower()
-            transcript = None
-
-            # Check for a corresponding transcript file in a sibling "Transkription" folder.
-            parent_dir = os.path.dirname(entry_path)
-            transcript_dir = os.path.join(parent_dir, "Transkription")
-            transcript_filename = os.path.splitext(entry.name)[0] + ".md"
-            transcript_path = os.path.join(transcript_dir, transcript_filename)
-            if os.path.exists(transcript_path):
-                try:
-                    with open(transcript_path, 'r', encoding='utf-8') as tf:
-                        transcript = tf.read()
-                except Exception:
-                    transcript = None
-
+
             # Retrieve the hit count for this file.
             hit_count = get_hit_count(relative_path)
+
+            category, titel, name, performance_date, site, transcript = None, None, None, None, None, None  # transcript too, so non-mp3 files don't keep a stale value
+
+            # Determine the site
+            if foldername == 'Gottesdienste Speyer':
+                site = 'Speyer'
+            elif foldername == 'Gottesdienste Schwegenheim':
+                site = 'Schwegenheim'
+
+            if filetype == '.mp3':
+                transcript = None
-            scanned_files.append((relative_path, foldername, entry.name, filetype, transcript, hit_count))
+                # Check for a corresponding transcript file in a sibling "Transkription" folder.
+                parent_dir = os.path.dirname(entry_path)
+                transcript_dir = os.path.join(parent_dir, "Transkription")
+                transcript_filename = os.path.splitext(entry.name)[0] + ".md"
+                transcript_path = os.path.join(transcript_dir, transcript_filename)
+                if os.path.exists(transcript_path):
+                    try:
+                        with open(transcript_path, 'r', encoding='utf-8') as tf:
+                            transcript = tf.read()
+                    except Exception:
+                        transcript = None
+
+                # extract category and titel from filename
+                filename_ext = os.path.splitext(entry.name)[0]
+                left_side, right_side = filename_ext.split('-', 1) if '-' in filename_ext else (filename_ext, None)
+                try:
+                    int(left_side.strip())
+                    # first part is only a number, so shift to the next segment
+                    previous_right_side = right_side
+                    left_side, right_side = previous_right_side.split('-', 1) if '-' in previous_right_side else (previous_right_side, None)
+                except (ValueError, TypeError):
+                    # first part is not a number (or nothing follows it), so skip this file
+                    continue
+
+                if 'predig' in left_side.lower():
+                    category = 'Predigt'
+                elif 'wort' in left_side.lower() or 'einladung' in left_side.lower():
+                    category = 'Vorwort'
+                elif 'chor' in left_side.lower():
+                    category = 'Chor'
+                elif 'orchester' in left_side.lower():
+                    category = 'Orchester'
+                elif 'gruppenlied' in left_side.lower() or 'jugendlied' in left_side.lower():
+                    category = 'Gruppenlied'
+                elif 'gemeinsam' in left_side.lower() or 'gesang' in left_side.lower() or 'lied' in left_side.lower():
+                    category = 'Gemeinsamer Gesang'
+                elif 'gedicht' in left_side.lower():
+                    category = 'Gedicht'
+                elif 'instrumental' in left_side.lower() or 'musikstück' in left_side.lower():
+                    category = 'Instrumental'
+                else:
+                    category = None
+
+                if right_side:
+                    titel, name = right_side.split('-', 1) if '-' in right_side else (right_side, None)
+                    if category == 'Predigt' or category == 'Vorwort' or category == 'Gedicht':
+                        if not name:  # no titel, just a name
+                            name = titel
+                            titel = None
+                else:
+                    titel = None
+                    name = None
+
+                # extract the date from the path using a regex (dd.mm.yyyy or dd.mm.yy)
+                date_match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{2,4})', relative_path)
+                if date_match:
+                    date_str = date_match.group(1)
+                    # Normalize to dd.mm.yyyy format
+                    try:
+                        date_obj = datetime.strptime(date_str, '%d.%m.%Y')
+                        performance_date = date_obj.strftime('%d.%m.%Y')
+                    except ValueError:
+                        try:
+                            date_obj = datetime.strptime(date_str, '%d.%m.%y')
+                            performance_date = date_obj.strftime('%d.%m.%Y')
+                        except ValueError:
+                            performance_date = None
+                else:
+                    performance_date = None
+
+            scanned_files.append((relative_path, foldername, entry.name, filetype, category, titel, name, performance_date, site, transcript, hit_count))
             current_keys.add((relative_path, entry.name))
 
         # Remove database entries for files under this base folder that are no longer on disk.
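For clarity, here is the naming convention the parsing above assumes, walked through on an invented filename of the form `NN-Kategorie-Titel[-Name].mp3`:

```python
# Worked example of the filename parsing above; the filename is invented.
import os

filename = "03-Gemeinsamer Gesang-Lobe den Herren.mp3"
stem = os.path.splitext(filename)[0]              # "03-Gemeinsamer Gesang-Lobe den Herren"
left_side, right_side = stem.split('-', 1)        # "03" / "Gemeinsamer Gesang-Lobe den Herren"
int(left_side.strip())                            # succeeds, so drop the track number
left_side, right_side = right_side.split('-', 1)  # "Gemeinsamer Gesang" / "Lobe den Herren"
assert 'gesang' in left_side.lower()              # -> category = 'Gemeinsamer Gesang'
# right_side contains no further '-', so titel = "Lobe den Herren" and name = None.
```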
@@ -127,7 +227,7 @@ def updatefileindex():
 
         # Bulk write the scanned files using INSERT OR REPLACE.
         cursor.executemany(
-            "INSERT OR REPLACE INTO files (relative_path, basefolder, filename, filetype, transcript, hitcount) VALUES (?, ?, ?, ?, ?, ?)",
+            "INSERT OR REPLACE INTO files (relative_path, basefolder, filename, filetype, category, titel, name, performance_date, site, transcript, hitcount) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
             scanned_files
         )
 
diff --git a/requirements.txt b/requirements.txt
index 4de8d00..e46e3fc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,5 +4,7 @@ pillow
 qrcode
 diskcache
 geoip2
+pandas
+openpyxl
 gunicorn
 eventlet
diff --git a/search.py b/search.py
index 06377be..d4025b9 100644
--- a/search.py
+++ b/search.py
@@ -1,6 +1,7 @@
 import sqlite3
 from flask import Flask, render_template, request, request, jsonify, session
 import os
+import random
 
 app = Flask(__name__)
 
@@ -39,10 +40,14 @@ def searchcommand():
         sql = "SELECT * FROM files"
         if conditions:
             sql += " WHERE " + " AND ".join(conditions)
-        sql += " ORDER BY hitcount DESC"
+
         cursor.execute(sql, params)
         raw_results = cursor.fetchall()
         results = [dict(row) for row in raw_results]
+
+        # Randomize the list before sorting to break ties randomly.
+        random.shuffle(results)
+        results.sort(key=lambda x: x["hitcount"], reverse=True)
     else:
         # Advanced search: include transcript. Count transcript hits.
@@ -74,9 +79,11 @@ def searchcommand():
             transcript = result.get("transcript") or ""
             total_hits = sum(transcript.lower().count(word.lower()) for word in words)
             result["transcript_hits"] = total_hits
-            result.pop("transcript")
+            result.pop("transcript", None)
             results.append(result)
-        # Sort so that files with more transcript hits appear first
+
+        # Randomize the list before sorting to break ties randomly.
+        random.shuffle(results)
         results.sort(key=lambda x: x["transcript_hits"], reverse=True)
         results = results[:100]
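A note on the shuffle-then-sort pattern used in both branches above: `list.sort` is stable, so a preceding `random.shuffle` only permutes entries whose sort key is equal. A quick check:

```python
# Demonstrates that shuffling before a stable sort breaks ties randomly.
import random

results = [{"filename": "a.mp3", "hitcount": 2},
           {"filename": "b.mp3", "hitcount": 2},
           {"filename": "c.mp3", "hitcount": 5}]
random.shuffle(results)
results.sort(key=lambda x: x["hitcount"], reverse=True)
# "c.mp3" always comes first; "a.mp3" and "b.mp3" follow in random order.
print([r["filename"] for r in results])
```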
diff --git a/templates/connections.html b/templates/connections.html
index ad0e15d..be84e10 100644
--- a/templates/connections.html
+++ b/templates/connections.html
@@ -67,18 +67,11 @@
         Downloads der letzten 10 Minuten
diff --git a/templates/dashboard.html b/templates/dashboard.html
index e22f2a5..edfa580 100644
--- a/templates/dashboard.html
+++ b/templates/dashboard.html
@@ -31,7 +31,8 @@