From 76eca80a4a6150743d4317e7b1a6686727ac43d1 Mon Sep 17 00:00:00 2001
From: lelo
Date: Mon, 31 Mar 2025 22:06:32 +0000
Subject: [PATCH] back to sqlite

---
 analytics.py       | 396 +++++++++++++++++++++++++--------------------
 docker-compose.yml |  22 ---
 requirements.txt   |   1 -
 3 files changed, 220 insertions(+), 199 deletions(-)

diff --git a/analytics.py b/analytics.py
index a876cb9..805df9f 100644
--- a/analytics.py
+++ b/analytics.py
@@ -1,37 +1,27 @@
+import sqlite3
 from flask import render_template, request
 from datetime import datetime, timedelta
 import geoip2.database
 from auth import require_secret
 import os
-import psycopg2
 
 file_access_temp = []
 
-dbname = os.environ.get('DB_NAME')
-user = os.environ.get('DB_USER')
-password = os.environ.get('DB_PASSWORD')
-host = os.environ.get('DB_HOST')
-port = int(os.environ.get('DB_PORT', 5432))
+# Example database name; you can change to whatever you want:
+DB_NAME = 'access_log.db'
 
-connection = psycopg2.connect(dbname=dbname,
-                              user=user,
-                              password=password,
-                              host=host,
-                              port=port
-                              )
-# Enable autocommit
-connection.autocommit = True
-log_db = connection
+# Create a single global connection to SQLite
+log_db = sqlite3.connect(DB_NAME, check_same_thread=False)
 
-# Function to initialize the database.
 def init_log_db():
-    with log_db.cursor() as cursor:
-        cursor.execute('''
+    """Create the file_access_log table if it doesn't already exist."""
+    with log_db:
+        log_db.execute('''
             CREATE TABLE IF NOT EXISTS file_access_log (
-                id SERIAL PRIMARY KEY,
-                timestamp TIMESTAMP,
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                timestamp TEXT,
                 rel_path TEXT,
-                filesize BIGINT,
+                filesize INTEGER,
                 mime TEXT,
                 ip_address TEXT,
                 user_agent TEXT,
@@ -42,8 +32,6 @@ def init_log_db():
 
 init_log_db()
 
-
-
 def lookup_location(ip, reader):
     try:
         response = reader.city(ip)
@@ -54,7 +42,7 @@
         return "Unknown", "Unknown"
 
 def get_device_type(user_agent):
-    "Classify device type based on user agent string"
+    """Classify device type based on user agent string."""
    if 'Android' in user_agent:
         return 'Android'
     elif 'iPhone' in user_agent or 'iPad' in user_agent:
@@ -68,22 +56,30 @@ def get_device_type(user_agent):
     else:
         return 'Other'
 
-# Logging function that uses the singleton connection.
 def log_file_access(rel_path, filesize, mime, ip_address, user_agent, device_id, cached):
+    """Insert a file access record into the database."""
     global file_access_temp
-    timestamp = datetime.now()  # Use datetime object directly
-    with log_db.connection.cursor() as cursor:
-        cursor.execute('''
-            INSERT INTO file_access_log (timestamp, rel_path, filesize, mime, ip_address, user_agent, device_id, cached)
-            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
-        ''', (timestamp, rel_path, filesize, mime, ip_address, user_agent, device_id, cached))
-    file_access_temp.insert(0, [timestamp.isoformat(), rel_path, filesize, mime, ip_address, user_agent, device_id, cached])
-    return timestamp.isoformat()
+    timestamp = datetime.now()  # a datetime object
+
+    # Store the ISO timestamp in the database for easy lexical comparison
+    iso_ts = timestamp.isoformat()
+
+    with log_db:
+        log_db.execute('''
+            INSERT INTO file_access_log
+            (timestamp, rel_path, filesize, mime, ip_address, user_agent, device_id, cached)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+        ''', (iso_ts, rel_path, filesize, mime, ip_address, user_agent, device_id, cached))
+    file_access_temp.insert(0, [iso_ts, rel_path, filesize, mime, ip_address, user_agent, device_id, cached])
+
+    return iso_ts
 
 def return_file_access():
+    """Return recent file access logs from memory (the last 10 minutes)."""
     global file_access_temp
     if file_access_temp:
         cutoff_time = datetime.now() - timedelta(minutes=10)
+        # Convert each stored timestamp (ISO string) back to datetime
         file_access_temp[:] = [
             entry for entry in file_access_temp
             if datetime.fromisoformat(entry[0]) >= cutoff_time
@@ -104,187 +100,235 @@ def dashboard():
     # Determine which file type we're filtering by.
     filetype = 'other'
-
-    allowed_list = ['mp3', 'wav', 'audio']
-    if filetype_arg.lower() in allowed_list:
+
+    # Some simplistic sets to decide how we match the MIME type
+    audio_list = ['mp3', 'wav', 'audio']
+    image_list = ['jpg', 'jpeg', 'image', 'photo']
+    video_list = ['mp4', 'mov', 'wmv', 'avi']
+
+    if filetype_arg.lower() in audio_list:
         filetype = 'audio/'
-
-    allowed_list = ['jpg', 'jpeg', 'image', 'photo']
-    if filetype_arg.lower() in allowed_list:
+    elif filetype_arg.lower() in image_list:
         filetype = 'image/'
-
-    allowed_list = ['mp4', 'mov', 'wmv', 'avi']
-    if filetype_arg.lower() in allowed_list:
+    elif filetype_arg.lower() in video_list:
         filetype = 'video/'
-
-
-    # Determine the start time based on timeframe.
+    # Determine start time based on timeframe
     if timeframe == 'today':
-        start = now.replace(hour=0, minute=0, second=0, microsecond=0)
+        start_dt = now.replace(hour=0, minute=0, second=0, microsecond=0)
     elif timeframe == '7days':
-        start = now - timedelta(days=7)
+        start_dt = now - timedelta(days=7)
     elif timeframe == '30days':
-        start = now - timedelta(days=30)
+        start_dt = now - timedelta(days=30)
     elif timeframe == '365days':
-        start = now - timedelta(days=365)
+        start_dt = now - timedelta(days=365)
     else:
-        start = now.replace(hour=0, minute=0, second=0, microsecond=0)
+        start_dt = now.replace(hour=0, minute=0, second=0, microsecond=0)
 
-    # Build the SQL filter for mime
+    # We'll compare the textual timestamp (ISO 8601).
+    start_str = start_dt.isoformat()
+
+    # Build the SQL filter
     if filetype == 'other':
-        # Exclude audio, image, and video mimes
-        filetype_filter_sql = "AND mime NOT LIKE 'audio/%' AND mime NOT LIKE 'image/%' AND mime NOT LIKE 'video/%'"
-        params = (start,)
+        # Exclude audio, image, video
+        filetype_filter_sql = (
+            "AND mime NOT LIKE 'audio/%' "
+            "AND mime NOT LIKE 'image/%' "
+            "AND mime NOT LIKE 'video/%' "
+        )
+        params_for_filter = (start_str,)
     else:
-        # Filter for mimes that start with the given type.
-        filetype_filter_sql = "AND mime LIKE %s"
-        params = (start, filetype + '%')
+        # Filter for mimes that start with the given type
+        filetype_filter_sql = "AND mime LIKE ?"
+        params_for_filter = (start_str, filetype + '%')
 
-    with log_db.connection.cursor() as cursor:
-        # Raw file access counts (top files)
-        query = f'''
-            SELECT rel_path, COUNT(*) as access_count
-            FROM file_access_log
-            WHERE timestamp >= %s {filetype_filter_sql}
-            GROUP BY rel_path
-            ORDER BY access_count DESC
-            LIMIT 20
-        '''
-        cursor.execute(query, params)
+    # 1. Top files by access count
+    query = f'''
+        SELECT rel_path, COUNT(*) as access_count
+        FROM file_access_log
+        WHERE timestamp >= ? {filetype_filter_sql}
+        GROUP BY rel_path
+        ORDER BY access_count DESC
+        LIMIT 20
+    '''
+    with log_db:
+        cursor = log_db.execute(query, params_for_filter)
         rows = cursor.fetchall()
 
-        # Daily access trend for a line chart
-        query = f'''
-            SELECT CAST(timestamp AS DATE) as date, COUNT(*) as count
-            FROM file_access_log
-            WHERE timestamp >= %s {filetype_filter_sql}
-            GROUP BY CAST(timestamp AS DATE)
-            ORDER BY date
-        '''
-        cursor.execute(query, params)
-        daily_access_data = [dict(date=str(row[0]), count=row[1]) for row in cursor.fetchall()]
+    # 2. Daily access trend (line chart)
+    # We'll group by day using substr(timestamp, 1, 10) -> YYYY-MM-DD
+    query = f'''
+        SELECT substr(timestamp, 1, 10) AS date, COUNT(*) AS count
+        FROM file_access_log
+        WHERE timestamp >= ? {filetype_filter_sql}
+        GROUP BY date
+        ORDER BY date
+    '''
+    with log_db:
+        cursor = log_db.execute(query, params_for_filter)
+        daily_rows = cursor.fetchall()
+    daily_access_data = [
+        dict(date=r[0], count=r[1]) for r in daily_rows
+    ]
 
-        # Aggregate download counts by time bucket according to the timeframe.
-        if timeframe == 'today':
-            query = f'''
-                SELECT to_char(timestamp, 'HH24') as bucket, COUNT(*) as count
-                FROM file_access_log
-                WHERE timestamp >= %s {filetype_filter_sql}
-                GROUP BY bucket
-                ORDER BY bucket
-            '''
-            cursor.execute(query, params)
-        elif timeframe in ('7days', '30days'):
-            query = f'''
-                SELECT CAST(timestamp AS DATE) as bucket, COUNT(*) as count
-                FROM file_access_log
-                WHERE timestamp >= %s {filetype_filter_sql}
-                GROUP BY bucket
-                ORDER BY bucket
-            '''
-            cursor.execute(query, params)
-        elif timeframe == '365days':
-            query = f'''
-                SELECT to_char(timestamp, 'YYYY-MM') as bucket, COUNT(*) as count
-                FROM file_access_log
-                WHERE timestamp >= %s {filetype_filter_sql}
-                GROUP BY bucket
-                ORDER BY bucket
-            '''
-            cursor.execute(query, params)
-        else:
-            query = f'''
-                SELECT CAST(timestamp AS DATE) as bucket, COUNT(*) as count
-                FROM file_access_log
-                WHERE timestamp >= %s {filetype_filter_sql}
-                GROUP BY bucket
-                ORDER BY bucket
-            '''
-            cursor.execute(query, params)
-        timeframe_data = [dict(bucket=row[0], count=row[1]) for row in cursor.fetchall()]
-
-        # User agent distribution (aggregate by device type)
+    # 3. Timeframe-based aggregation
+    # We'll group by hour if "today", by day if "7days"/"30days", by month if "365days".
+    if timeframe == 'today':
+        # Hour: substr(timestamp, 12, 2) -> HH
         query = f'''
-            SELECT user_agent, COUNT(*) as count
+            SELECT substr(timestamp, 12, 2) AS bucket, COUNT(*) AS count
             FROM file_access_log
-            WHERE timestamp >= %s {filetype_filter_sql}
-            GROUP BY user_agent
-            ORDER BY count DESC
+            WHERE timestamp >= ? {filetype_filter_sql}
+            GROUP BY bucket
+            ORDER BY bucket
         '''
-        cursor.execute(query, params)
-        raw_user_agents = [dict(user_agent=row[0], count=row[1]) for row in cursor.fetchall()]
-        device_counts = {}
-        for entry in raw_user_agents:
-            device = get_device_type(entry['user_agent'])
-            device_counts[device] = device_counts.get(device, 0) + entry['count']
-        user_agent_data = [dict(device=device, count=count) for device, count in device_counts.items()]
-
-        # Parent folder distribution
+    elif timeframe in ('7days', '30days'):
+        # Day: substr(timestamp, 1, 10) -> YYYY-MM-DD
         query = f'''
-            SELECT rel_path, COUNT(*) as count
+            SELECT substr(timestamp, 1, 10) AS bucket, COUNT(*) AS count
             FROM file_access_log
-            WHERE timestamp >= %s {filetype_filter_sql}
-            GROUP BY rel_path
-            ORDER BY count DESC
+            WHERE timestamp >= ? {filetype_filter_sql}
+            GROUP BY bucket
+            ORDER BY bucket
         '''
-        cursor.execute(query, params)
-        folder_data = {}
-        for row in cursor.fetchall():
-            rel_path = row[0]
-            parent_folder = rel_path.rsplit('/', 1)[0] if '/' in rel_path else "Root"
-            folder_data[parent_folder] = folder_data.get(parent_folder, 0) + row[1]
-        folder_data = [dict(folder=folder, count=count) for folder, count in folder_data.items()]
-        folder_data.sort(key=lambda x: x['count'], reverse=True)
-        folder_data = folder_data[:10]
-
-        # Aggregate IP addresses with counts
+    elif timeframe == '365days':
+        # Month: substr(timestamp, 1, 7) -> YYYY-MM
         query = f'''
-            SELECT ip_address, COUNT(*) as count
+            SELECT substr(timestamp, 1, 7) AS bucket, COUNT(*) AS count
             FROM file_access_log
-            WHERE timestamp >= %s {filetype_filter_sql}
-            GROUP BY ip_address
-            ORDER BY count DESC
+            WHERE timestamp >= ? {filetype_filter_sql}
+            GROUP BY bucket
+            ORDER BY bucket
         '''
-        cursor.execute(query, params)
+    else:
+        # Default: group by day
+        query = f'''
+            SELECT substr(timestamp, 1, 10) AS bucket, COUNT(*) AS count
+            FROM file_access_log
+            WHERE timestamp >= ? {filetype_filter_sql}
+            GROUP BY bucket
+            ORDER BY bucket
+        '''
+    with log_db:
+        cursor = log_db.execute(query, params_for_filter)
+        timeframe_data_rows = cursor.fetchall()
+    timeframe_data = [
+        dict(bucket=r[0], count=r[1]) for r in timeframe_data_rows
+    ]
+
+    # 4. User agent distribution
+    query = f'''
+        SELECT user_agent, COUNT(*) AS count
+        FROM file_access_log
+        WHERE timestamp >= ? {filetype_filter_sql}
+        GROUP BY user_agent
+        ORDER BY count DESC
+    '''
+    with log_db:
+        cursor = log_db.execute(query, params_for_filter)
+        raw_user_agents = cursor.fetchall()
+    device_counts = {}
+    for (ua, cnt) in raw_user_agents:
+        device = get_device_type(ua)
+        device_counts[device] = device_counts.get(device, 0) + cnt
+    user_agent_data = [
+        dict(device=d, count=c) for d, c in device_counts.items()
+    ]
+
+    # 5. Parent folder distribution
+    query = f'''
+        SELECT rel_path, COUNT(*) AS count
+        FROM file_access_log
+        WHERE timestamp >= ? {filetype_filter_sql}
+        GROUP BY rel_path
+        ORDER BY count DESC
+    '''
+    folder_data_dict = {}
+    with log_db:
+        cursor = log_db.execute(query, params_for_filter)
+        for (rp, c) in cursor.fetchall():
+            if '/' in rp:
+                parent_folder = rp.rsplit('/', 1)[0]
+            else:
+                parent_folder = "Root"
+            folder_data_dict[parent_folder] = folder_data_dict.get(parent_folder, 0) + c
+    folder_data = [dict(folder=f, count=cnt) for f, cnt in folder_data_dict.items()]
+    folder_data.sort(key=lambda x: x['count'], reverse=True)
+    folder_data = folder_data[:10]
+
+    # 6. Aggregate IP addresses with counts
+    query = f'''
+        SELECT ip_address, COUNT(*) as count
+        FROM file_access_log
+        WHERE timestamp >= ? {filetype_filter_sql}
+        GROUP BY ip_address
+        ORDER BY count DESC
+    '''
+    with log_db:
+        cursor = log_db.execute(query, params_for_filter)
         ip_rows = cursor.fetchall()
 
-        # Summary stats using separate SQL queries
-        query = f'SELECT COUNT(*) FROM file_access_log WHERE timestamp >= %s {filetype_filter_sql}'
-        cursor.execute(query, params)
+    # 7. Summary stats
+    # total_accesses
+    query = f'''
+        SELECT COUNT(*)
+        FROM file_access_log
+        WHERE timestamp >= ? {filetype_filter_sql}
+    '''
+    with log_db:
+        cursor = log_db.execute(query, params_for_filter)
         total_accesses = cursor.fetchone()[0]
 
-        query = f'SELECT COUNT(DISTINCT rel_path) FROM file_access_log WHERE timestamp >= %s {filetype_filter_sql}'
-        cursor.execute(query, params)
+    # unique_files
+    query = f'''
+        SELECT COUNT(DISTINCT rel_path)
+        FROM file_access_log
+        WHERE timestamp >= ? {filetype_filter_sql}
+    '''
+    with log_db:
+        cursor = log_db.execute(query, params_for_filter)
         unique_files = cursor.fetchone()[0]
 
-        query = f'SELECT COUNT(DISTINCT device_id) FROM file_access_log WHERE timestamp >= %s {filetype_filter_sql}'
-        cursor.execute(query, params)
+    # unique_user
+    query = f'''
+        SELECT COUNT(DISTINCT device_id)
+        FROM file_access_log
+        WHERE timestamp >= ? {filetype_filter_sql}
+    '''
+    with log_db:
+        cursor = log_db.execute(query, params_for_filter)
         unique_user = cursor.fetchone()[0]
 
-    # Process location data with GeoIP2.
+    # 8. Process location data with GeoIP2
     reader = geoip2.database.Reader('GeoLite2-City.mmdb')
-    location_data = {}
-    for ip, count in ip_rows:
-        country, city = lookup_location(ip, reader)
+    location_data_dict = {}
+    for (ip_addr, cnt) in ip_rows:
+        country, city = lookup_location(ip_addr, reader)
         key = (country, city)
-        location_data[key] = location_data.get(key, 0) + count
+        location_data_dict[key] = location_data_dict.get(key, 0) + cnt
     reader.close()
-    location_data = [dict(country=key[0], city=key[1], count=value) for key, value in location_data.items()]
+    location_data = [
+        dict(country=k[0], city=k[1], count=v)
+        for k, v in location_data_dict.items()
+    ]
     location_data.sort(key=lambda x: x['count'], reverse=True)
     location_data = location_data[:20]
 
-    return render_template("dashboard.html",
-                           timeframe=timeframe,
-                           rows=rows,
-                           daily_access_data=daily_access_data,
-                           user_agent_data=user_agent_data,
-                           folder_data=folder_data,
-                           location_data=location_data,
-                           total_accesses=total_accesses,
-                           unique_files=unique_files,
-                           unique_user=unique_user,
-                           timeframe_data=timeframe_data)
-
+    # Convert the top-files rows to a list of dictionaries
+    # (just for consistency in passing to template).
+    rows = [dict(rel_path=r[0], access_count=r[1]) for r in rows]
+    return render_template(
+        "dashboard.html",
+        timeframe=timeframe,
+        rows=rows,
+        daily_access_data=daily_access_data,
+        user_agent_data=user_agent_data,
+        folder_data=folder_data,
+        location_data=location_data,
+        total_accesses=total_accesses,
+        unique_files=unique_files,
+        unique_user=unique_user,
+        timeframe_data=timeframe_data
+    )
diff --git a/docker-compose.yml b/docker-compose.yml
index 63a065c..b55bb90 100755
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -19,15 +19,8 @@ services:
       - FLASK_ENV=production
       - TITLE_SHORT=${TITLE_SHORT}
       - TITLE_LONG=${TITLE_LONG}
-      - DB_HOST=postgres-db
-      - DB_USER=${DB_USER}
-      - DB_PASSWORD=${DB_PASSWORD}
-      - DB_NAME=${DB_NAME}
-    depends_on:
-      - "postgres"
     networks:
       - traefik
-      - internal
     labels:
       - "traefik.enable=true"
@@ -51,22 +44,7 @@ services:
       sh -c "pip install -r requirements.txt && gunicorn --worker-class eventlet -w 1 -b 0.0.0.0:5000 app:app"
 
-  postgres:
-    image: postgres:17
-    restart: always
-    environment:
-      POSTGRES_USER: ${DB_USER:?}
-      POSTGRES_PASSWORD: ${DB_PASSWORD:?}
-      POSTGRES_DB: ${DB_NAME:?}
-    volumes:
-      - ./postgres_data:/var/lib/postgresql/data
-    networks:
-      internal:
-        aliases:
-          - postgres-db
 
 networks:
   traefik:
     external: true
-  internal:
-    internal: true
diff --git a/requirements.txt b/requirements.txt
index aad4790..4de8d00 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,3 @@ diskcache
 geoip2
 gunicorn
 eventlet
-psycopg2-binary