"""File-access logging and analytics dashboard views (Flask + PostgreSQL + GeoIP2)."""
from flask import render_template, request
from datetime import datetime, timedelta
import geoip2.database
from auth import require_secret
import os
import psycopg2

# In-memory buffer of recent file accesses, newest first. Filled by
# log_file_access() and pruned/served by return_file_access().
file_access_temp = []
class SingletonMeta(type):
    """Metaclass that gives every class using it exactly one shared instance."""

    # Registry mapping each class to its sole instance.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        """Return the class's single instance, creating it on first use."""
        try:
            return cls._instances[cls]
        except KeyError:
            obj = super().__call__(*args, **kwargs)
            cls._instances[cls] = obj
            return obj
# Database class that only handles the connection.
class Database(metaclass=SingletonMeta):
    """Singleton holder of one autocommitting psycopg2 connection."""

    def __init__(self):
        """Read settings from the environment, connect, and ensure the log table exists."""
        env = os.environ.get
        self.dbname = env('DB_NAME')
        self.user = env('DB_USER')
        self.password = env('DB_PASSWORD')
        self.host = env('DB_HOST')
        self.port = int(env('DB_PORT', 5432))

        self.connection = psycopg2.connect(
            dbname=self.dbname,
            user=self.user,
            password=self.password,
            host=self.host,
            port=self.port,
        )
        # Enable autocommit so each INSERT is durable without explicit commits.
        self.connection.autocommit = True

        self.init_log_db()

    def init_log_db(self):
        """Create the file_access_log table if it does not exist yet."""
        ddl = '''
        CREATE TABLE IF NOT EXISTS file_access_log (
            id SERIAL PRIMARY KEY,
            timestamp TIMESTAMP,
            rel_path TEXT,
            filesize BIGINT,
            mime TEXT,
            ip_address TEXT,
            user_agent TEXT,
            device_id TEXT,
            cached BOOLEAN
        )
        '''
        with self.connection.cursor() as cursor:
            cursor.execute(ddl)
# Shared singleton connection, created once at import time.
log_db = Database()
def lookup_location(ip, reader):
    """Resolve an IP address to a (country, city) pair via a GeoIP2 reader.

    Any lookup failure (unknown IP, reader error) yields ("Unknown", "Unknown");
    missing name fields fall back to "Unknown" individually.
    """
    try:
        record = reader.city(ip)
        country_name = record.country.name or "Unknown"
        city_name = record.city.name or "Unknown"
        return country_name, city_name
    except Exception:
        # Best-effort lookup: never let geolocation break the caller.
        return "Unknown", "Unknown"
def get_device_type(user_agent):
    """Classify device type based on user agent string.

    Order matters: Android UAs also contain "Linux", and iPhone/iPad must be
    checked before the desktop families, so needles are tested in priority order.
    """
    priority = (
        (('Android',), 'Android'),
        (('iPhone', 'iPad'), 'iOS'),
        (('Windows',), 'Windows'),
        (('Macintosh', 'Mac OS'), 'MacOS'),
        (('Linux',), 'Linux'),
    )
    for needles, label in priority:
        if any(needle in user_agent for needle in needles):
            return label
    return 'Other'
# Logging function that uses the singleton connection.
def log_file_access(rel_path, filesize, mime, ip_address, user_agent, device_id, cached):
    """Persist one file-access event to Postgres and mirror it in the
    in-memory buffer used by the live connections view.

    Returns the event timestamp as an ISO-8601 string.
    """
    stamp = datetime.now()  # Use datetime object directly
    record = (stamp, rel_path, filesize, mime, ip_address, user_agent, device_id, cached)
    with log_db.connection.cursor() as cursor:
        cursor.execute(
            '''
            INSERT INTO file_access_log (timestamp, rel_path, filesize, mime, ip_address, user_agent, device_id, cached)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            ''',
            record,
        )
    # Newest entries first, as the connections view expects.
    file_access_temp.insert(0, list(record))
    return stamp.isoformat()
def return_file_access():
    """Return the in-memory access entries from the last 10 minutes.

    Prunes older entries from ``file_access_temp`` in place and returns the
    surviving list (or a fresh empty list when the buffer is empty).

    Bug fix: entries are written by ``log_file_access`` with a ``datetime``
    object at index 0, but this function previously called
    ``datetime.fromisoformat(entry[0])`` unconditionally, raising
    ``TypeError`` whenever the buffer was non-empty. Both ``datetime``
    objects and ISO-8601 strings are now accepted.
    """
    if not file_access_temp:
        return []
    cutoff_time = datetime.now() - timedelta(minutes=10)

    def _as_datetime(value):
        # Tolerate both storage formats for the timestamp slot.
        return value if isinstance(value, datetime) else datetime.fromisoformat(value)

    file_access_temp[:] = [
        entry for entry in file_access_temp
        if _as_datetime(entry[0]) >= cutoff_time
    ]
    return file_access_temp
@require_secret
def connections():
    """Render the live-connections page (data is fetched separately)."""
    return render_template('connections.html')
@require_secret
def dashboard():
    """Render the analytics dashboard.

    Query-string parameters:
        filetype: alias for a mime family — audio (default), image, or video;
                  any other value selects "other" (non-audio/image/video mimes).
        timeframe: 'today' (default), '7days', '30days', or '365days'.

    Improvements over the previous version: the four copy-pasted bucket
    queries are consolidated into one query driven by a fixed expression
    table, and the GeoIP reader is opened with a context manager so the
    mmdb handle is closed even if an exception escapes (it previously
    leaked on error).
    """
    filetype_arg = request.args.get('filetype', 'audio')
    timeframe = request.args.get('timeframe', 'today')
    now = datetime.now()

    # Determine which file type we're filtering by. Aliases map onto a mime
    # prefix; anything unrecognized means "other".
    alias_groups = {
        'audio/': ('mp3', 'wav', 'audio'),
        'image/': ('jpg', 'jpeg', 'image', 'photo'),
        'video/': ('mp4', 'mov', 'wmv', 'avi'),
    }
    filetype = 'other'
    for prefix, aliases in alias_groups.items():
        if filetype_arg.lower() in aliases:
            filetype = prefix
            break

    # Determine the start time based on timeframe (default: start of today).
    if timeframe == '7days':
        start = now - timedelta(days=7)
    elif timeframe == '30days':
        start = now - timedelta(days=30)
    elif timeframe == '365days':
        start = now - timedelta(days=365)
    else:  # 'today' and any unrecognized value
        start = now.replace(hour=0, minute=0, second=0, microsecond=0)

    # Build the SQL filter for mime. Only fixed literals are interpolated into
    # the query text; all user-derived values travel as bind parameters.
    if filetype == 'other':
        # Exclude audio, image, and video mimes.
        filetype_filter_sql = "AND mime NOT LIKE 'audio/%' AND mime NOT LIKE 'image/%' AND mime NOT LIKE 'video/%'"
        params = (start,)
    else:
        # Filter for mimes that start with the given type.
        filetype_filter_sql = "AND mime LIKE %s"
        params = (start, filetype + '%')

    with log_db.connection.cursor() as cursor:
        # Raw file access counts (top 20 files).
        cursor.execute(f'''
            SELECT rel_path, COUNT(*) as access_count
            FROM file_access_log
            WHERE timestamp >= %s {filetype_filter_sql}
            GROUP BY rel_path
            ORDER BY access_count DESC
            LIMIT 20
        ''', params)
        rows = cursor.fetchall()

        # Daily access trend for a line chart.
        cursor.execute(f'''
            SELECT CAST(timestamp AS DATE) as date, COUNT(*) as count
            FROM file_access_log
            WHERE timestamp >= %s {filetype_filter_sql}
            GROUP BY CAST(timestamp AS DATE)
            ORDER BY date
        ''', params)
        daily_access_data = [dict(date=str(row[0]), count=row[1]) for row in cursor.fetchall()]

        # Aggregate download counts by time bucket: hours for today, days for
        # week/month views, months for the yearly view. Expressions are fixed
        # SQL literals, never user input.
        bucket_exprs = {
            'today': "to_char(timestamp, 'HH24')",
            '7days': "CAST(timestamp AS DATE)",
            '30days': "CAST(timestamp AS DATE)",
            '365days': "to_char(timestamp, 'YYYY-MM')",
        }
        bucket_expr = bucket_exprs.get(timeframe, "CAST(timestamp AS DATE)")
        cursor.execute(f'''
            SELECT {bucket_expr} as bucket, COUNT(*) as count
            FROM file_access_log
            WHERE timestamp >= %s {filetype_filter_sql}
            GROUP BY bucket
            ORDER BY bucket
        ''', params)
        timeframe_data = [dict(bucket=row[0], count=row[1]) for row in cursor.fetchall()]

        # User agent distribution, aggregated into device families.
        cursor.execute(f'''
            SELECT user_agent, COUNT(*) as count
            FROM file_access_log
            WHERE timestamp >= %s {filetype_filter_sql}
            GROUP BY user_agent
            ORDER BY count DESC
        ''', params)
        device_counts = {}
        for user_agent, count in cursor.fetchall():
            device = get_device_type(user_agent)
            device_counts[device] = device_counts.get(device, 0) + count
        user_agent_data = [dict(device=device, count=count) for device, count in device_counts.items()]

        # Parent folder distribution (top 10).
        cursor.execute(f'''
            SELECT rel_path, COUNT(*) as count
            FROM file_access_log
            WHERE timestamp >= %s {filetype_filter_sql}
            GROUP BY rel_path
            ORDER BY count DESC
        ''', params)
        folder_counts = {}
        for rel_path, count in cursor.fetchall():
            parent_folder = rel_path.rsplit('/', 1)[0] if '/' in rel_path else "Root"
            folder_counts[parent_folder] = folder_counts.get(parent_folder, 0) + count
        folder_data = [dict(folder=folder, count=count) for folder, count in folder_counts.items()]
        folder_data.sort(key=lambda x: x['count'], reverse=True)
        folder_data = folder_data[:10]

        # Aggregate IP addresses with counts (feeds the GeoIP section below).
        cursor.execute(f'''
            SELECT ip_address, COUNT(*) as count
            FROM file_access_log
            WHERE timestamp >= %s {filetype_filter_sql}
            GROUP BY ip_address
            ORDER BY count DESC
        ''', params)
        ip_rows = cursor.fetchall()

        # Summary stats using separate SQL queries.
        cursor.execute(f'SELECT COUNT(*) FROM file_access_log WHERE timestamp >= %s {filetype_filter_sql}', params)
        total_accesses = cursor.fetchone()[0]

        cursor.execute(f'SELECT COUNT(DISTINCT rel_path) FROM file_access_log WHERE timestamp >= %s {filetype_filter_sql}', params)
        unique_files = cursor.fetchone()[0]

        cursor.execute(f'SELECT COUNT(DISTINCT device_id) FROM file_access_log WHERE timestamp >= %s {filetype_filter_sql}', params)
        unique_user = cursor.fetchone()[0]

    # Process location data with GeoIP2; the context manager guarantees the
    # mmdb handle is released even on error.
    location_counts = {}
    with geoip2.database.Reader('GeoLite2-City.mmdb') as reader:
        for ip, count in ip_rows:
            key = lookup_location(ip, reader)
            location_counts[key] = location_counts.get(key, 0) + count

    location_data = [dict(country=country, city=city, count=count)
                     for (country, city), count in location_counts.items()]
    location_data.sort(key=lambda x: x['count'], reverse=True)
    location_data = location_data[:20]

    return render_template("dashboard.html",
                           timeframe=timeframe,
                           rows=rows,
                           daily_access_data=daily_access_data,
                           user_agent_data=user_agent_data,
                           folder_data=folder_data,
                           location_data=location_data,
                           total_accesses=total_accesses,
                           unique_files=unique_files,
                           unique_user=unique_user,
                           timeframe_data=timeframe_data)