diff --git a/.gitignore b/.gitignore
index 81f3168..357ccda 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 /filecache_image
 /filecache_video
 /filecache_other
+/postgres_data
 /instance
 /__pycache__
 /access_log.db
diff --git a/analytics.py b/analytics.py
index 9bd6475..93da7af 100644
--- a/analytics.py
+++ b/analytics.py
@@ -1,13 +1,45 @@
 from flask import render_template, request, session
-import sqlite3
-from datetime import datetime, date, timedelta
+from datetime import datetime, timedelta
 import geoip2.database
 from urllib.parse import urlparse, unquote
-
 from auth import require_secret
+import os
+import threading
+import psycopg2
 
 file_access_temp = []
 
+# Thread-safe singleton metaclass.
+class SingletonMeta(type):
+    _instances = {}
+    _lock = threading.Lock()  # Ensures thread safety.
+
+    def __call__(cls, *args, **kwargs):
+        with cls._lock:
+            if cls not in cls._instances:
+                instance = super().__call__(*args, **kwargs)
+                cls._instances[cls] = instance
+        return cls._instances[cls]
+
+# Database class that only handles the connection.
+class Database(metaclass=SingletonMeta):
+    def __init__(self):
+        self.dbname = os.environ.get('DB_NAME')
+        self.user = os.environ.get('DB_USER')
+        self.password = os.environ.get('DB_PASSWORD')
+        self.host = os.environ.get('DB_HOST')
+        self.port = int(os.environ.get('DB_PORT', 5432))
+        self.connection = psycopg2.connect(dbname=self.dbname,
+                                           user=self.user,
+                                           password=self.password,
+                                           host=self.host,
+                                           port=self.port)
+        # Enable autocommit so we don't have to call commit() after every transaction.
+        self.connection.autocommit = True
+
+# Create a global database instance.
+log_db = Database()
+
 def lookup_location(ip, reader):
     try:
         response = reader.city(ip)
@@ -18,7 +50,7 @@ def lookup_location(ip, reader):
     return "Unknown", "Unknown"
 
 def get_device_type(user_agent):
-    "classify device type based on user agent string"
+    "Classify device type based on user agent string"
    if 'Android' in user_agent:
        return 'Android'
    elif 'iPhone' in user_agent or 'iPad' in user_agent:
@@ -32,47 +64,41 @@ def get_device_type(user_agent):
     else:
         return 'Other'
 
-def log_file_access(rel_path, ip_address, user_agent, device_id):
-    """
-    Log file access details to a SQLite database.
-    Records the timestamp, full file path, client IP, user agent, and device_id.
-    """
-    global file_access_temp
-    # Connect to the database (this will create the file if it doesn't exist)
-    conn = sqlite3.connect('access_log.db')
-    cursor = conn.cursor()
-    # Create the table if it doesn't exist
-    cursor.execute('''
-        CREATE TABLE IF NOT EXISTS file_access_log (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            timestamp TEXT,
-            rel_path TEXT,
-            ip_address TEXT,
-            user_agent TEXT,
-            device_id TEXT
-        )
-    ''')
-    # Gather information from the request
-    timestamp = datetime.now().isoformat()
-
-    # Insert the access record into the database
-    cursor.execute('''
-        INSERT INTO file_access_log (timestamp, rel_path, ip_address, user_agent, device_id)
-        VALUES (?, ?, ?, ?, ?)
-    ''', (timestamp, rel_path, ip_address, user_agent, device_id))
-    conn.commit()
-    conn.close()
-    file_access_temp.insert(0, [timestamp, rel_path, ip_address, user_agent, device_id])
-    return return_file_access()
+# Function to initialize the database.
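+# Note: this is only invoked from the __main__ guard at the bottom of this
+# module, so it must be run once (e.g. `python analytics.py`) before logging.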
+def init_log_db():
+    with log_db.connection.cursor() as cursor:
+        cursor.execute('''
+            CREATE TABLE IF NOT EXISTS file_access_log (
+                id SERIAL PRIMARY KEY,
+                timestamp TIMESTAMP,
+                rel_path TEXT,
+                filesize BIGINT,
+                mime TEXT,
+                ip_address TEXT,
+                user_agent TEXT,
+                device_id TEXT,
+                cached BOOLEAN
+            )
+        ''')
+
+# Logging function that uses the singleton connection.
+def log_file_access(rel_path, filesize, mime, ip_address, user_agent, device_id, cached):
+    timestamp = datetime.now()  # Use datetime object directly
+    with log_db.connection.cursor() as cursor:
+        cursor.execute('''
+            INSERT INTO file_access_log (timestamp, rel_path, filesize, mime, ip_address, user_agent, device_id, cached)
+            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
+        ''', (timestamp, rel_path, filesize, mime, ip_address, user_agent, device_id, cached))
+    return timestamp.isoformat()
 
 def return_file_access():
     global file_access_temp
-    if len(file_access_temp) > 0:
-        # Compute the cutoff time (10 minutes ago from now)
+    if file_access_temp:
         cutoff_time = datetime.now() - timedelta(minutes=10)
-        # Update the list in-place to keep only entries newer than 10 minutes
         file_access_temp[:] = [
             entry for entry in file_access_temp
             if datetime.fromisoformat(entry[0]) >= cutoff_time
         ]
     return file_access_temp
@@ -99,155 +125,134 @@ def dashboard():
     else:
         start = now.replace(hour=0, minute=0, second=0, microsecond=0)
 
-    conn = sqlite3.connect('access_log.db')
-    cursor = conn.cursor()
-
-    # Raw file access counts for the table (top files)
-    cursor.execute('''
-        SELECT rel_path, COUNT(*) as access_count
-        FROM file_access_log
-        WHERE timestamp >= ?
-        GROUP BY rel_path
-        ORDER BY access_count DESC
-        LIMIT 20
-    ''', (start.isoformat(),))
-    rows = cursor.fetchall()
-
-    # Daily access trend for a line chart
-    cursor.execute('''
-        SELECT date(timestamp) as date, COUNT(*) as count
-        FROM file_access_log
-        WHERE timestamp >= ?
-        GROUP BY date
-        ORDER BY date
-    ''', (start.isoformat(),))
-    daily_access_data = [dict(date=row[0], count=row[1]) for row in cursor.fetchall()]
-
-    # Aggregate download counts by time bucket according to the timeframe.
-    if timeframe == 'today':
-        # Group by hour (0-23)
+    with log_db.connection.cursor() as cursor:
+        # Raw file access counts for the table (top files)
         cursor.execute('''
-            SELECT strftime('%H', timestamp) as bucket, COUNT(*) as count
+            SELECT rel_path, COUNT(*) as access_count
             FROM file_access_log
-            WHERE timestamp >= ?
-            GROUP BY bucket
-            ORDER BY bucket
-        ''', (start.isoformat(),))
-    elif timeframe in ('7days', '30days'):
-        # Group by day (YYYY-MM-DD)
+            WHERE timestamp >= %s
+            GROUP BY rel_path
+            ORDER BY access_count DESC
+            LIMIT 20
+        ''', (start,))
+        rows = cursor.fetchall()
+
+        # Daily access trend for a line chart
         cursor.execute('''
-            SELECT date(timestamp) as bucket, COUNT(*) as count
+            SELECT CAST(timestamp AS DATE) as date, COUNT(*) as count
             FROM file_access_log
-            WHERE timestamp >= ?
-            GROUP BY bucket
-            ORDER BY bucket
-        ''', (start.isoformat(),))
-    elif timeframe == '365days':
-        # Group by month (YYYY-MM)
+            WHERE timestamp >= %s
+            GROUP BY CAST(timestamp AS DATE)
+            ORDER BY date
+        ''', (start,))
+        daily_access_data = [dict(date=str(row[0]), count=row[1]) for row in cursor.fetchall()]
+
+        # Aggregate download counts by time bucket according to the timeframe.
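+        # (today -> hourly buckets, 7/30 days -> daily, 365 days -> monthly)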
+        if timeframe == 'today':
+            # Group by hour using to_char
+            cursor.execute('''
+                SELECT to_char(timestamp, 'HH24') as bucket, COUNT(*) as count
+                FROM file_access_log
+                WHERE timestamp >= %s
+                GROUP BY bucket
+                ORDER BY bucket
+            ''', (start,))
+        elif timeframe in ('7days', '30days'):
+            # Group by day
+            cursor.execute('''
+                SELECT CAST(timestamp AS DATE) as bucket, COUNT(*) as count
+                FROM file_access_log
+                WHERE timestamp >= %s
+                GROUP BY bucket
+                ORDER BY bucket
+            ''', (start,))
+        elif timeframe == '365days':
+            # Group by month using to_char
+            cursor.execute('''
+                SELECT to_char(timestamp, 'YYYY-MM') as bucket, COUNT(*) as count
+                FROM file_access_log
+                WHERE timestamp >= %s
+                GROUP BY bucket
+                ORDER BY bucket
+            ''', (start,))
+        else:
+            # Fallback: group by day
+            cursor.execute('''
+                SELECT CAST(timestamp AS DATE) as bucket, COUNT(*) as count
+                FROM file_access_log
+                WHERE timestamp >= %s
+                GROUP BY bucket
+                ORDER BY bucket
+            ''', (start,))
+        timeframe_data = [dict(bucket=str(row[0]), count=row[1]) for row in cursor.fetchall()]  # str() keeps DATE buckets JSON-friendly
+
+        # User agent distribution (aggregate by device type)
         cursor.execute('''
-            SELECT strftime('%Y-%m', timestamp) as bucket, COUNT(*) as count
+            SELECT user_agent, COUNT(*) as count
             FROM file_access_log
-            WHERE timestamp >= ?
-            GROUP BY bucket
-            ORDER BY bucket
-        ''', (start.isoformat(),))
-    else:
-        # Fallback: group by day
+            WHERE timestamp >= %s
+            GROUP BY user_agent
+            ORDER BY count DESC
+        ''', (start,))
+        raw_user_agents = [dict(user_agent=row[0], count=row[1]) for row in cursor.fetchall()]
+        device_counts = {}
+        for entry in raw_user_agents:
+            device = get_device_type(entry['user_agent'])
+            device_counts[device] = device_counts.get(device, 0) + entry['count']
+        user_agent_data = [dict(device=device, count=count) for device, count in device_counts.items()]
+
+        # Parent folder distribution
         cursor.execute('''
-            SELECT date(timestamp) as bucket, COUNT(*) as count
+            SELECT rel_path, COUNT(*) as count
             FROM file_access_log
-            WHERE timestamp >= ?
-            GROUP BY bucket
-            ORDER BY bucket
-        ''', (start.isoformat(),))
-    timeframe_data = [dict(bucket=row[0], count=row[1]) for row in cursor.fetchall()]
+            WHERE timestamp >= %s
+            GROUP BY rel_path
+            ORDER BY count DESC
+        ''', (start,))
+        folder_data = {}
+        for row in cursor.fetchall():
+            rel_path = row[0]
+            parent_folder = rel_path.rsplit('/', 1)[0] if '/' in rel_path else "Root"
+            folder_data[parent_folder] = folder_data.get(parent_folder, 0) + row[1]
+        folder_data = [dict(folder=folder, count=count) for folder, count in folder_data.items()]
+        folder_data.sort(key=lambda x: x['count'], reverse=True)
+        folder_data = folder_data[:10]
 
-    # User agent distribution (aggregate by device type)
-    cursor.execute('''
-        SELECT user_agent, COUNT(*) as count
-        FROM file_access_log
-        WHERE timestamp >= ?
-        GROUP BY user_agent
-        ORDER BY count DESC
-    ''', (start.isoformat(),))
-    raw_user_agents = [dict(user_agent=row[0], count=row[1]) for row in cursor.fetchall()]
-    device_counts = {}
-    for entry in raw_user_agents:
-        device = get_device_type(entry['user_agent'])
-        device_counts[device] = device_counts.get(device, 0) + entry['count']
-    # Rename to user_agent_data for compatibility with the frontend
-    user_agent_data = [dict(device=device, count=count) for device, count in device_counts.items()]
+        # Aggregate IP addresses with counts
+        cursor.execute('''
+            SELECT ip_address, COUNT(*) as count
+            FROM file_access_log
+            WHERE timestamp >= %s
+            GROUP BY ip_address
+            ORDER BY count DESC
+        ''', (start,))
+        ip_rows = cursor.fetchall()
 
-    # Parent folder distribution
-    cursor.execute('''
-        SELECT rel_path, COUNT(*) as count
-        FROM file_access_log
-        WHERE timestamp >= ?
-        GROUP BY rel_path
-        ORDER BY count DESC
-    ''', (start.isoformat(),))
-    folder_data = {}
-    for row in cursor.fetchall():
-        rel_path = row[0]
-        parent_folder = rel_path.rsplit('/', 1)[0] if '/' in rel_path else "Root"
-        folder_data[parent_folder] = folder_data.get(parent_folder, 0) + row[1]
-
-    # Convert the dictionary to a list of dictionaries
-    folder_data = [
-        dict(folder=folder, count=count)
-        for folder, count in folder_data.items()
-    ]
-
-    # Sort by count in descending order and take the top 10
-    folder_data.sort(key=lambda x: x['count'], reverse=True)
-    folder_data = folder_data[:10]
+        # Summary stats using separate SQL queries
+        cursor.execute('SELECT COUNT(*) FROM file_access_log WHERE timestamp >= %s', (start,))
+        total_accesses = cursor.fetchone()[0]
 
-    # Aggregate IP addresses with counts
-    cursor.execute('''
-        SELECT ip_address, COUNT(*) as count
-        FROM file_access_log
-        WHERE timestamp >= ?
-        GROUP BY ip_address
-        ORDER BY count DESC
-    ''', (start.isoformat(),))
-    ip_rows = cursor.fetchall()
+        cursor.execute('SELECT COUNT(DISTINCT rel_path) FROM file_access_log WHERE timestamp >= %s', (start,))
+        unique_files = cursor.fetchone()[0]
 
-    # Initialize GeoIP2 reader once for efficiency
+        cursor.execute('SELECT COUNT(DISTINCT device_id) FROM file_access_log WHERE timestamp >= %s', (start,))
+        unique_user = cursor.fetchone()[0]
+
+    # Process location data with GeoIP2.
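+    # Assumes the GeoLite2-City.mmdb database file is present in the working directory.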
     reader = geoip2.database.Reader('GeoLite2-City.mmdb')
     location_data = {}
     for ip, count in ip_rows:
         country, city = lookup_location(ip, reader)
         key = (country, city)
-        if key in location_data:
-            location_data[key] += count
-        else:
-            location_data[key] = count
+        location_data[key] = location_data.get(key, 0) + count
     reader.close()
-
-    # Convert the dictionary to a list of dictionaries
-    location_data = [
-        dict(country=key[0], city=key[1], count=value)
-        for key, value in location_data.items()
-    ]
-
-    # Sort by count in descending order and take the top 20
+
+    location_data = [dict(country=key[0], city=key[1], count=value) for key, value in location_data.items()]
     location_data.sort(key=lambda x: x['count'], reverse=True)
     location_data = location_data[:20]
 
-    # Summary stats using separate SQL queries
-    cursor.execute('SELECT COUNT(*) FROM file_access_log WHERE timestamp >= ?', (start.isoformat(),))
-    total_accesses = cursor.fetchone()[0]
-
-    # Use a separate query to count unique files (distinct rel_path values)
-    cursor.execute('SELECT COUNT(DISTINCT rel_path) FROM file_access_log WHERE timestamp >= ?', (start.isoformat(),))
-    unique_files = cursor.fetchone()[0]
-
-    # Use a separate query to count unique IP addresses
-    cursor.execute('SELECT COUNT(DISTINCT device_id) FROM file_access_log WHERE timestamp >= ?', (start.isoformat(),))
-    unique_user = cursor.fetchone()[0]
-
-    conn.close()
-
     return render_template("dashboard.html",
                            timeframe=timeframe,
                            rows=rows,
@@ -258,4 +263,7 @@ def dashboard():
                            total_accesses=total_accesses,
                            unique_files=unique_files,
                            unique_user=unique_user,
-                           timeframe_data=timeframe_data)
\ No newline at end of file
+                           timeframe_data=timeframe_data)
+
+if __name__ == '__main__':
+    init_log_db()
diff --git a/app.py b/app.py
index e0b8e98..fee0c9f 100755
--- a/app.py
+++ b/app.py
@@ -197,21 +197,9 @@ def serve_file(subpath):
 
     mime, _ = mimetypes.guess_type(full_path)
     mime = mime or 'application/octet-stream'
-
-    # logging only for mp3
-    if mime and mime.startswith('audio/mpeg'):
-        # HEAD request are coming in to initiate server caching.
-        # only log initial hits and not the reload of further file parts
-        range_header = request.headers.get('Range')
-        # only request with starting from the beginning of the file will be tracked
-        # no range -> full file not just the first byte
-        if request.method == 'GET' and (not range_header or (range_header.startswith("bytes=0-") and range_header != "bytes=0-1")):
-            ip_address = request.remote_addr
-            user_agent = request.headers.get('User-Agent')
-            threading.Thread(
-                target=a.log_file_access,
-                args=(subpath, ip_address, user_agent, session['device_id'])
-            ).start()
+    range_header = request.headers.get('Range')
+    ip_address = request.remote_addr
+    user_agent = request.headers.get('User-Agent')
 
     # Check cache first (using diskcache)
     response = None
@@ -231,6 +219,7 @@ def serve_file(subpath):
         if cached:
             cached_file_bytes, mime = cached
             cached_file = io.BytesIO(cached_file_bytes)
+            filesize = len(cached_file.getbuffer())
             response = send_file(cached_file, mimetype=mime)
         else:
             if mime and mime.startswith('image/'):
@@ -245,6 +234,7 @@ def serve_file(subpath):
 
                 save_kwargs = {'quality': 85}
                 img_bytes_io = io.BytesIO()
                 img.save(img_bytes_io, format=output_format, **save_kwargs)
                 thumb_bytes = img_bytes_io.getvalue()
+                filesize = len(thumb_bytes)
                 cache.set(subpath, (thumb_bytes, output_mime))
@@ -258,13 +248,34 @@ def serve_file(subpath):
                 with open(full_path, 'rb') as f:
                     file_bytes = f.read()
                 cache.set(subpath, (file_bytes, mime))
-                response = send_file(io.BytesIO(file_bytes), mimetype=mime, conditional=True)
+                file_bytes_io = io.BytesIO(file_bytes)
+                filesize = len(file_bytes_io.getbuffer())
+                response = send_file(file_bytes_io, mimetype=mime, conditional=True)
             except Exception as e:
                 app.logger.error(f"Failed to read file {subpath}: {e}")
                 abort(500)
 
     # Set Cache-Control header (browser caching for 1 day)
     response.headers['Cache-Control'] = 'public, max-age=86400'
+
+    if mime and mime.startswith('audio/mpeg'):  # special rules for mp3 files
+        # HEAD requests come in to initiate server caching; ignore them and log only GET requests.
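+        # Players typically probe with a "bytes=0-1" request before streaming,
+        # so that probe is deliberately not counted as a real access.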
+        # Log the access when there is no Range header (a full-file request), or when
+        # the range starts at byte 0 but covers more than just "bytes=0-1".
+        if request.method == 'GET' and (not range_header or (range_header.startswith("bytes=0-") and range_header != "bytes=0-1")):
+            should_log = True
+        else:
+            should_log = False
+    else:
+        should_log = True
+
+    if should_log:
+        threading.Thread(
+            target=a.log_file_access,
+            args=(subpath, filesize, mime, ip_address, user_agent, session['device_id'], bool(cached))
+        ).start()
+
     return response
diff --git a/docker-compose.yml b/docker-compose.yml
index b7c98ee..828364d 100755
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,7 +1,7 @@
 services:
   flask-app:
     image: python:3.11-slim
-    container_name: "${CONTAINER_NAME}"
+    container_name: "${CONTAINER_NAME}.web"
     restart: always
     working_dir: /app
     volumes:
@@ -19,8 +19,16 @@
       - FLASK_ENV=production
       - TITLE_SHORT=${TITLE_SHORT}
       - TITLE_LONG=${TITLE_LONG}
+      - DB_HOST=postgres
+      - DB_PORT=5432
+      - DB_USER=${POSTGRES_USER}
+      - DB_PASSWORD=${POSTGRES_PASSWORD}
+      - DB_NAME=${POSTGRES_DB}
+    depends_on:
+      - postgres
     networks:
       - traefik
+      - internal
 
     labels:
       - "traefik.enable=true"
@@ -44,6 +52,21 @@
       sh -c "pip install -r requirements.txt &&
              gunicorn --worker-class eventlet -w 1 -b 0.0.0.0:5000 app:app"
 
+  postgres:
+    image: postgres:15
+    container_name: "${CONTAINER_NAME}.postgres"
+    restart: always
+    environment:
+      POSTGRES_USER: ${POSTGRES_USER}
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
+      POSTGRES_DB: ${POSTGRES_DB}
+    volumes:
+      - ./postgres_data:/var/lib/postgresql/data
+    networks:
+      - internal
+
 networks:
   traefik:
     external: true
+  internal:
+    internal: true
diff --git a/requirements.txt b/requirements.txt
index 4de8d00..aad4790 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ diskcache
 geoip2
 gunicorn
 eventlet
+psycopg2-binary