initial postgres

lelo 2025-03-31 17:37:39 +00:00
parent 71c0585380
commit bc4aa70651
5 changed files with 225 additions and 187 deletions

.gitignore

@@ -4,6 +4,7 @@
/filecache_image
/filecache_video
/filecache_other
/postgres_data
/instance
/__pycache__
/access_log.db


@@ -1,13 +1,45 @@
from flask import render_template, request, session
import sqlite3
from datetime import datetime, date, timedelta
from datetime import datetime, timedelta
import geoip2.database
from urllib.parse import urlparse, unquote
from auth import require_secret
import os
import threading
import psycopg2
file_access_temp = []
# Thread-safe singleton metaclass.
class SingletonMeta(type):
_instances = {}
_lock = threading.Lock() # Ensures thread safety.
def __call__(cls, *args, **kwargs):
with cls._lock:
if cls not in cls._instances:
instance = super().__call__(*args, **kwargs)
cls._instances[cls] = instance
return cls._instances[cls]
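A quick self-contained sketch of what the metaclass guarantees (not part of the commit; the Config class is illustrative): every instantiation returns the same object, even when first-time construction races between threads.

import threading

class SingletonMeta(type):
    _instances = {}
    _lock = threading.Lock()  # serializes first-time construction

    def __call__(cls, *args, **kwargs):
        with cls._lock:
            if cls not in cls._instances:
                cls._instances[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]

class Config(metaclass=SingletonMeta):
    def __init__(self):
        self.created = True  # runs only once, for the first caller

assert Config() is Config()  # every call yields the single shared instance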
# Database class that only handles the connection.
class Database(metaclass=SingletonMeta):
def __init__(self):
self.dbname = os.environ.get('DB_NAME')
self.user = os.environ.get('DB_USER')
self.password = os.environ.get('DB_PASSWORD')
self.host = os.environ.get('DB_HOST')
self.port = int(os.environ.get('DB_PORT', 5432))
self.connection = psycopg2.connect(dbname=self.dbname,
user=self.user,
password=self.password,
host=self.host,
port=self.port)
# Enable autocommit so we don't have to call commit() after every transaction.
self.connection.autocommit = True
# Create a global database instance.
log_db = Database()
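Why one shared connection is workable here: psycopg2 connections are documented as shareable between threads, while cursors are not, which is why every helper below opens a short-lived cursor per call. A minimal sketch, assuming the log_db singleton above:

import threading

def probe():
    # each thread opens its own cursor on the shared connection
    with log_db.connection.cursor() as cur:
        cur.execute('SELECT 1')
        cur.fetchone()

workers = [threading.Thread(target=probe) for _ in range(4)]
for w in workers:
    w.start()
for w in workers:
    w.join()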
def lookup_location(ip, reader):
try:
response = reader.city(ip)
@@ -18,7 +50,7 @@ def lookup_location(ip, reader):
return "Unknown", "Unknown"
def get_device_type(user_agent):
"classify device type based on user agent string"
"Classify device type based on user agent string"
if 'Android' in user_agent:
return 'Android'
elif 'iPhone' in user_agent or 'iPad' in user_agent:
@@ -32,45 +64,37 @@ def get_device_type(user_agent):
else:
return 'Other'
def log_file_access(rel_path, ip_address, user_agent, device_id):
"""
Log file access details to a SQLite database.
Records the timestamp, full file path, client IP, user agent, and device_id.
"""
global file_access_temp
# Connect to the database (this will create the file if it doesn't exist)
conn = sqlite3.connect('access_log.db')
cursor = conn.cursor()
# Create the table if it doesn't exist
# Function to initialize the database.
def init_log_db():
with log_db.connection.cursor() as cursor:
cursor.execute('''
CREATE TABLE IF NOT EXISTS file_access_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp TEXT,
id SERIAL PRIMARY KEY,
timestamp TIMESTAMP,
rel_path TEXT,
filesize BIGINT,
mime TEXT,
ip_address TEXT,
user_agent TEXT,
device_id TEXT
device_id TEXT,
cached BOOLEAN
)
''')
# Gather information from the request
timestamp = datetime.now().isoformat()
# Insert the access record into the database
# Logging function that uses the singleton connection.
def log_file_access(rel_path, filesize, mime, ip_address, user_agent, device_id, cached):
timestamp = datetime.now() # Use datetime object directly
with log_db.connection.cursor() as cursor:
cursor.execute('''
INSERT INTO file_access_log (timestamp, rel_path, ip_address, user_agent, device_id)
VALUES (?, ?, ?, ?, ?)
''', (timestamp, rel_path, ip_address, user_agent, device_id))
conn.commit()
conn.close()
file_access_temp.insert(0, [timestamp, rel_path, ip_address, user_agent, device_id])
return return_file_access()
INSERT INTO file_access_log (timestamp, rel_path, filesize, mime, ip_address, user_agent, device_id, cached)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
''', (timestamp, rel_path, filesize, mime, ip_address, user_agent, device_id, cached))
return timestamp.isoformat()
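Note the placeholder switch that runs through every query in this commit: psycopg2 uses %s for all parameters (never SQLite's ?) and adapts Python types itself. A small sketch, assuming the log_db singleton and the table created above:

with log_db.connection.cursor() as cursor:
    # count cached hits; True is bound directly to the BOOLEAN column
    cursor.execute('SELECT COUNT(*) FROM file_access_log WHERE cached = %s', (True,))
    print(cursor.fetchone()[0])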
def return_file_access():
global file_access_temp
if len(file_access_temp) > 0:
# Compute the cutoff time (10 minutes ago from now)
if file_access_temp:
cutoff_time = datetime.now() - timedelta(minutes=10)
# Update the list in-place to keep only entries newer than 10 minutes
file_access_temp[:] = [
entry for entry in file_access_temp
if datetime.fromisoformat(entry[0]) >= cutoff_time
@@ -99,106 +123,96 @@ def dashboard():
else:
start = now.replace(hour=0, minute=0, second=0, microsecond=0)
conn = sqlite3.connect('access_log.db')
cursor = conn.cursor()
with log_db.connection.cursor() as cursor:
# Raw file access counts for the table (top files)
cursor.execute('''
SELECT rel_path, COUNT(*) as access_count
FROM file_access_log
WHERE timestamp >= ?
WHERE timestamp >= %s
GROUP BY rel_path
ORDER BY access_count DESC
LIMIT 20
''', (start.isoformat(),))
''', (start,))
rows = cursor.fetchall()
# Daily access trend for a line chart
cursor.execute('''
SELECT date(timestamp) as date, COUNT(*) as count
SELECT CAST(timestamp AS DATE) as date, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= ?
GROUP BY date
WHERE timestamp >= %s
GROUP BY CAST(timestamp AS DATE)
ORDER BY date
''', (start.isoformat(),))
daily_access_data = [dict(date=row[0], count=row[1]) for row in cursor.fetchall()]
''', (start,))
daily_access_data = [dict(date=str(row[0]), count=row[1]) for row in cursor.fetchall()]
# Aggregate download counts by time bucket according to the timeframe.
if timeframe == 'today':
# Group by hour (0-23)
# Group by hour using to_char
cursor.execute('''
SELECT strftime('%H', timestamp) as bucket, COUNT(*) as count
SELECT to_char(timestamp, 'HH24') as bucket, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= ?
WHERE timestamp >= %s
GROUP BY bucket
ORDER BY bucket
''', (start.isoformat(),))
''', (start,))
elif timeframe in ('7days', '30days'):
# Group by day (YYYY-MM-DD)
# Group by day
cursor.execute('''
SELECT date(timestamp) as bucket, COUNT(*) as count
SELECT CAST(timestamp AS DATE) as bucket, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= ?
WHERE timestamp >= %s
GROUP BY bucket
ORDER BY bucket
''', (start.isoformat(),))
''', (start,))
elif timeframe == '365days':
# Group by month (YYYY-MM)
# Group by month using to_char
cursor.execute('''
SELECT strftime('%Y-%m', timestamp) as bucket, COUNT(*) as count
SELECT to_char(timestamp, 'YYYY-MM') as bucket, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= ?
WHERE timestamp >= %s
GROUP BY bucket
ORDER BY bucket
''', (start.isoformat(),))
''', (start,))
else:
# Fallback: group by day
cursor.execute('''
SELECT date(timestamp) as bucket, COUNT(*) as count
SELECT CAST(timestamp AS DATE) as bucket, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= ?
WHERE timestamp >= %s
GROUP BY bucket
ORDER BY bucket
''', (start.isoformat(),))
''', (start,))
timeframe_data = [dict(bucket=row[0], count=row[1]) for row in cursor.fetchall()]
# User agent distribution (aggregate by device type)
cursor.execute('''
SELECT user_agent, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= ?
WHERE timestamp >= %s
GROUP BY user_agent
ORDER BY count DESC
''', (start.isoformat(),))
''', (start,))
raw_user_agents = [dict(user_agent=row[0], count=row[1]) for row in cursor.fetchall()]
device_counts = {}
for entry in raw_user_agents:
device = get_device_type(entry['user_agent'])
device_counts[device] = device_counts.get(device, 0) + entry['count']
# Rename to user_agent_data for compatibility with the frontend
user_agent_data = [dict(device=device, count=count) for device, count in device_counts.items()]
# Parent folder distribution
cursor.execute('''
SELECT rel_path, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= ?
WHERE timestamp >= %s
GROUP BY rel_path
ORDER BY count DESC
''', (start.isoformat(),))
''', (start,))
folder_data = {}
for row in cursor.fetchall():
rel_path = row[0]
parent_folder = rel_path.rsplit('/', 1)[0] if '/' in rel_path else "Root"
folder_data[parent_folder] = folder_data.get(parent_folder, 0) + row[1]
# Convert the dictionary to a list of dictionaries
folder_data = [
dict(folder=folder, count=count)
for folder, count in folder_data.items()
]
# Sort by count in descending order and take the top 10
folder_data = [dict(folder=folder, count=count) for folder, count in folder_data.items()]
folder_data.sort(key=lambda x: x['count'], reverse=True)
folder_data = folder_data[:10]
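The bucket expressions above are the Postgres equivalents of the removed strftime calls: strftime('%H', ts) becomes to_char(ts, 'HH24'), date(ts) becomes CAST(ts AS DATE), and strftime('%Y-%m', ts) becomes to_char(ts, 'YYYY-MM'). Because to_char returns zero-padded, fixed-width text, ORDER BY bucket still sorts chronologically. A standalone check of the three expressions (a sketch, reusing the singleton connection):

conn = log_db.connection  # assumption: the shared connection from above
with conn.cursor() as cur:
    cur.execute("""
        SELECT to_char(ts, 'HH24'), CAST(ts AS DATE), to_char(ts, 'YYYY-MM')
        FROM (SELECT TIMESTAMP '2025-03-31 17:37:39' AS ts) AS t
    """)
    print(cur.fetchone())  # ('17', datetime.date(2025, 3, 31), '2025-03')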
@@ -206,48 +220,35 @@ def dashboard():
cursor.execute('''
SELECT ip_address, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= ?
WHERE timestamp >= %s
GROUP BY ip_address
ORDER BY count DESC
''', (start.isoformat(),))
''', (start,))
ip_rows = cursor.fetchall()
# Initialize GeoIP2 reader once for efficiency
# Summary stats using separate SQL queries
cursor.execute('SELECT COUNT(*) FROM file_access_log WHERE timestamp >= %s', (start,))
total_accesses = cursor.fetchone()[0]
cursor.execute('SELECT COUNT(DISTINCT rel_path) FROM file_access_log WHERE timestamp >= %s', (start,))
unique_files = cursor.fetchone()[0]
cursor.execute('SELECT COUNT(DISTINCT device_id) FROM file_access_log WHERE timestamp >= %s', (start,))
unique_user = cursor.fetchone()[0]
# Process location data with GeoIP2.
reader = geoip2.database.Reader('GeoLite2-City.mmdb')
location_data = {}
for ip, count in ip_rows:
country, city = lookup_location(ip, reader)
key = (country, city)
if key in location_data:
location_data[key] += count
else:
location_data[key] = count
location_data[key] = location_data.get(key, 0) + count
reader.close()
# Convert the dictionary to a list of dictionaries
location_data = [
dict(country=key[0], city=key[1], count=value)
for key, value in location_data.items()
]
# Sort by count in descending order and take the top 20
location_data = [dict(country=key[0], city=key[1], count=value) for key, value in location_data.items()]
location_data.sort(key=lambda x: x['count'], reverse=True)
location_data = location_data[:20]
# Summary stats using separate SQL queries
cursor.execute('SELECT COUNT(*) FROM file_access_log WHERE timestamp >= ?', (start.isoformat(),))
total_accesses = cursor.fetchone()[0]
# Use a separate query to count unique files (distinct rel_path values)
cursor.execute('SELECT COUNT(DISTINCT rel_path) FROM file_access_log WHERE timestamp >= ?', (start.isoformat(),))
unique_files = cursor.fetchone()[0]
# Use a separate query to count unique IP addresses
cursor.execute('SELECT COUNT(DISTINCT device_id) FROM file_access_log WHERE timestamp >= ?', (start.isoformat(),))
unique_user = cursor.fetchone()[0]
conn.close()
return render_template("dashboard.html",
timeframe=timeframe,
rows=rows,
@@ -259,3 +260,6 @@ def dashboard():
unique_files=unique_files,
unique_user=unique_user,
timeframe_data=timeframe_data)
if __name__ == '__main__':
init_log_db()
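One caveat with the __main__ guard: init_log_db() only runs when this module is executed directly, so the Flask app also needs to call it once at startup, unless it already does so somewhere not shown in this diff. A sketch of that call site (the module alias a is an assumption, taken from app.py's a.log_file_access calls):

import a  # hypothetical import name for this analytics module

a.init_log_db()  # ensure file_access_log exists before the first request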

app.py

@@ -197,21 +197,9 @@ def serve_file(subpath):
mime, _ = mimetypes.guess_type(full_path)
mime = mime or 'application/octet-stream'
# logging only for mp3
if mime and mime.startswith('audio/mpeg'):
# HEAD requests come in to initiate server caching.
# only log initial hits, not reloads of further file parts
range_header = request.headers.get('Range')
# only requests starting from the beginning of the file are tracked
# no Range header -> full file, not just the first byte
if request.method == 'GET' and (not range_header or (range_header.startswith("bytes=0-") and range_header != "bytes=0-1")):
ip_address = request.remote_addr
user_agent = request.headers.get('User-Agent')
threading.Thread(
target=a.log_file_access,
args=(subpath, ip_address, user_agent, session['device_id'])
).start()
# Check cache first (using diskcache)
response = None
@@ -231,6 +219,7 @@ def serve_file(subpath):
if cached:
cached_file_bytes, mime = cached
cached_file = io.BytesIO(cached_file_bytes)
filesize = len(cached_file.getbuffer())
response = send_file(cached_file, mimetype=mime)
else:
if mime and mime.startswith('image/'):
@@ -245,6 +234,7 @@ def serve_file(subpath):
save_kwargs = {'quality': 85}
img_bytes_io = io.BytesIO()
img.save(img_bytes_io, format=output_format, **save_kwargs)
filesize = len(img_bytes_io.getbuffer())
thumb_bytes = img_bytes_io.getvalue()
cache.set(subpath, (thumb_bytes, output_mime))
@@ -258,13 +248,32 @@ def serve_file(subpath):
with open(full_path, 'rb') as f:
file_bytes = f.read()
cache.set(subpath, (file_bytes, mime))
response = send_file(io.BytesIO(file_bytes), mimetype=mime, conditional=True)
file_bytes_io = io.BytesIO(file_bytes)
filesize = len(file_bytes_io.getbuffer())
response = send_file(file_bytes_io, mimetype=mime, conditional=True)
except Exception as e:
app.logger.error(f"Failed to read file {subpath}: {e}")
abort(500)
# Set Cache-Control header (browser caching for 1 day)
response.headers['Cache-Control'] = 'public, max-age=86400'
if mime and mime.startswith('audio/mpeg'): # special rules for mp3 files
# HEAD requests come in to initiate server caching; ignore them and log only GET requests.
# Log access if there is no Range header, or if the range starts at byte 0 and covers more than the two-byte "bytes=0-1" probe.
if request.method == 'GET' and (not range_header or (range_header.startswith("bytes=0-") and range_header != "bytes=0-1")):
logging = True
else:
logging = False
else:
logging = True
if logging:
threading.Thread(
target=a.log_file_access,
args=(subpath, filesize, mime, ip_address, user_agent, session['device_id'], bool(cached), )
).start()
return response
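The mp3 branch of that logging decision is easy to misread, so here it is pulled out as a standalone predicate (a sketch; the function name is illustrative) together with the cases it separates:

def should_log_mp3(method, range_header):
    if method != 'GET':
        return False  # HEAD requests only prime the server-side cache
    if not range_header:
        return True   # no Range header: a full-file request
    # "bytes=0-..." means playback from the start, except the two-byte
    # probe "bytes=0-1" that some players send before streaming
    return range_header.startswith('bytes=0-') and range_header != 'bytes=0-1'

assert should_log_mp3('GET', None)
assert should_log_mp3('GET', 'bytes=0-')
assert not should_log_mp3('GET', 'bytes=0-1')
assert not should_log_mp3('GET', 'bytes=131072-')
assert not should_log_mp3('HEAD', None)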

docker-compose.yml

@@ -1,7 +1,7 @@
services:
flask-app:
image: python:3.11-slim
container_name: "${CONTAINER_NAME}"
container_name: "${CONTAINER_NAME}.web"
restart: always
working_dir: /app
volumes:
@@ -19,8 +19,16 @@ services:
- FLASK_ENV=production
- TITLE_SHORT=${TITLE_SHORT}
- TITLE_LONG=${TITLE_LONG}
- DB_HOST=postgres
- DB_PORT=5432
- DB_USER=${POSTGRES_USER}
- DB_PASSWORD=${POSTGRES_PASSWORD}
- DB_NAME=${POSTGRES_DB}
depends_on:
- postgres
networks:
- traefik
- internal
labels:
- "traefik.enable=true"
@@ -44,6 +52,21 @@ services:
sh -c "pip install -r requirements.txt &&
gunicorn --worker-class eventlet -w 1 -b 0.0.0.0:5000 app:app"
postgres:
image: postgres:15
container_name: "${CONTAINER_NAME}.postgres"
restart: always
environment:
POSTGRES_USER: ${POSTGRES_USER}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
POSTGRES_DB: ${POSTGRES_DB}
volumes:
- ./postgres_data:/var/lib/postgresql/data
networks:
- internal
networks:
traefik:
external: true
internal:
internal: true
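One operational note: depends_on only orders container startup; it does not wait for Postgres to accept connections, so the flask-app can boot before the database is ready. A common guard is a small retry loop around the connect call, sketched here in Python with the same environment variables the compose file injects:

import os
import time

import psycopg2

def connect_with_retry(retries=10, delay=2.0):
    # retry until Postgres accepts connections (a sketch, not in the commit)
    for attempt in range(retries):
        try:
            return psycopg2.connect(
                dbname=os.environ.get('DB_NAME'),
                user=os.environ.get('DB_USER'),
                password=os.environ.get('DB_PASSWORD'),
                host=os.environ.get('DB_HOST'),
                port=int(os.environ.get('DB_PORT', 5432)),
            )
        except psycopg2.OperationalError:
            if attempt == retries - 1:
                raise
            time.sleep(delay)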

requirements.txt

@@ -6,3 +6,4 @@ diskcache
geoip2
gunicorn
eventlet
psycopg2-binary