# bethaus-app/analytics.py

import sqlite3
from flask import render_template, request, session
from datetime import datetime, timedelta
import geoip2.database
from auth import require_secret
import os

# In-memory list of recent file accesses. log_file_access() inserts each new
# record at index 0 (so the newest entry is first) and return_file_access()
# drops entries older than ten minutes.
file_access_temp = []

# File name of the SQLite database that stores the access log.
DB_NAME = 'access_log.db'

# A single module-wide SQLite connection, opened at import time.
# check_same_thread=False lets threads other than the creating one use it.
log_db = sqlite3.connect(DB_NAME, check_same_thread=False)

def init_log_db():
    """Ensure the ``file_access_log`` table exists in the log database.

    The statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this more
    than once is harmless.
    """
    create_table_sql = '''
            CREATE TABLE IF NOT EXISTS file_access_log (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                timestamp TEXT,
                rel_path TEXT,
                filesize INTEGER,
                mime TEXT,
                ip_address TEXT,
                user_agent TEXT,
                device_id TEXT,
                cached BOOLEAN
            )
        '''
    # The connection's context manager commits on success and rolls back
    # if the statement raises.
    with log_db:
        log_db.execute(create_table_sql)

# Runs at import time so the table exists before any access is logged.
init_log_db()

def lookup_location(ip, reader):
    """Resolve an IP address to a ``(country, city)`` pair of names.

    Args:
        ip: The address handed to ``reader.city``.
        reader: Object exposing a ``city(ip)`` method whose result has
            ``country.name`` and ``city.name`` attributes.

    Returns:
        A ``(country, city)`` tuple. A missing or empty name is replaced by
        ``"Unknown"``; if the lookup raises, both parts are ``"Unknown"``.
    """
    unknown = "Unknown"
    try:
        record = reader.city(ip)
        return record.country.name or unknown, record.city.name or unknown
    except Exception:
        # Best effort: any lookup failure simply yields an unknown location.
        return unknown, unknown

def get_device_type(user_agent):
    """Classify a device by substrings found in its user agent string.

    Args:
        user_agent: The raw User-Agent value. ``None`` or an empty string
            (for example a NULL database column or a missing header) is
            accepted and classified as ``'Other'``.

    Returns:
        One of ``'Android'``, ``'iOS'``, ``'Windows'``, ``'MacOS'``,
        ``'Linux'`` or ``'Other'``.
    """
    if not user_agent:
        return 'Other'

    # Order matters: the first matching rule wins, so 'Android' must be
    # tested before 'Linux' and 'iPhone'/'iPad' before 'Mac OS', because the
    # earlier markers can appear together with the later ones.
    rules = (
        ('Android', ('Android',)),
        ('iOS', ('iPhone', 'iPad')),
        ('Windows', ('Windows',)),
        ('MacOS', ('Macintosh', 'Mac OS')),
        ('Linux', ('Linux',)),
    )
    for label, markers in rules:
        if any(marker in user_agent for marker in markers):
            return label
    return 'Other'

def log_file_access(rel_path, filesize, mime, ip_address, user_agent, device_id, cached):
    """Record one file access in the database and in the in-memory list.

    The timestamp is taken from ``datetime.now()`` and stored as an ISO 8601
    string so that timestamps can be compared as text.

    Returns:
        The ISO timestamp string that was stored for this access.
    """
    iso_ts = datetime.now().isoformat()
    record = (iso_ts, rel_path, filesize, mime, ip_address, user_agent, device_id, cached)

    insert_sql = '''
            INSERT INTO file_access_log
                (timestamp, rel_path, filesize, mime, ip_address, user_agent, device_id, cached)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        '''
    with log_db:
        log_db.execute(insert_sql, record)

    # Newest entry goes to the front; it is kept as a list, matching the
    # shape that consumers of file_access_temp receive.
    file_access_temp.insert(0, list(record))
    return iso_ts

def return_file_access():
    """Return the in-memory access records from the last ten minutes.

    Older records are removed from ``file_access_temp`` in place. When the
    list is empty a fresh empty list is returned; otherwise the (pruned)
    module-level list itself is returned.
    """
    if not file_access_temp:
        return []

    oldest_allowed = datetime.now() - timedelta(minutes=10)
    # Each record starts with its ISO timestamp string; parse it back to a
    # datetime to compare against the cutoff.
    still_recent = [
        record for record in file_access_temp
        if datetime.fromisoformat(record[0]) >= oldest_allowed
    ]
    file_access_temp[:] = still_recent
    return file_access_temp

# NOTE(review): access control is delegated to require_secret (imported from
# auth); its exact behavior is not visible in this module.
@require_secret
def connections():
    """Render the ``connections.html`` template."""
    return render_template('connections.html')

# Keyword accepted in the ``filetype`` query argument -> MIME prefix it
# selects. Any keyword not listed here is treated as "other".
_MIME_PREFIX_BY_KEYWORD = {
    'mp3': 'audio/', 'wav': 'audio/', 'audio': 'audio/',
    'jpg': 'image/', 'jpeg': 'image/', 'image': 'image/', 'photo': 'image/',
    'mp4': 'video/', 'mov': 'video/', 'wmv': 'video/', 'avi': 'video/',
}

# Filter used for "other": everything that is not audio, image or video.
_OTHER_MIME_FILTER_SQL = (
    "AND mime NOT LIKE 'audio/%' "
    "AND mime NOT LIKE 'image/%' "
    "AND mime NOT LIKE 'video/%' "
)

# timeframe -> (look-back window, length of the ISO timestamp prefix used as
# chart bucket): 13 chars = 'YYYY-MM-DDTHH' (hour), 10 = 'YYYY-MM-DD' (day),
# 7 = 'YYYY-MM' (month).
_TIMEFRAME_SETTINGS = {
    'last24hours': (timedelta(hours=24), 13),
    '7days': (timedelta(days=7), 10),
    '30days': (timedelta(days=30), 10),
    '365days': (timedelta(days=365), 7),
}

# An unrecognised timeframe looks back 24 hours but is bucketed by day.
_FALLBACK_TIMEFRAME_SETTINGS = (timedelta(hours=24), 10)


def _select_from_access_log(select_sql, filter_sql, params, tail_sql=''):
    """Run one SELECT over ``file_access_log`` limited to the dashboard filters.

    ``select_sql``, ``filter_sql`` and ``tail_sql`` must be fixed fragments
    built by this module; user-supplied values only travel through ``params``.
    Returns the list of result rows.
    """
    query = f'''
        {select_sql}
        FROM file_access_log
        WHERE timestamp >= ? {filter_sql}
        {tail_sql}
    '''
    with log_db:
        return log_db.execute(query, params).fetchall()


@require_secret
def dashboard():
    """Render the analytics dashboard for the selected file type and timeframe.

    The ``filetype`` and ``timeframe`` query arguments are read from the
    request and remembered in the session; when absent, the session values
    (defaulting to ``'audio'`` and ``'last24hours'``) are used.

    Returns:
        The rendered ``dashboard.html`` template, fed with the top files,
        per-bucket access and distinct-device counts, device-type, folder and
        location distributions, and summary statistics.
    """
    session['filetype'] = request.args.get(
        'filetype', session.get('filetype', 'audio'))
    session['timeframe'] = request.args.get(
        'timeframe', session.get('timeframe', 'last24hours'))

    window, bucket_len = _TIMEFRAME_SETTINGS.get(
        session['timeframe'], _FALLBACK_TIMEFRAME_SETTINGS)

    # Timestamps are stored as ISO 8601 text, so a string comparison against
    # the ISO form of the cutoff selects the wanted period.
    start_str = (datetime.now() - window).isoformat()

    mime_prefix = _MIME_PREFIX_BY_KEYWORD.get(session['filetype'].lower())
    if mime_prefix is None:
        filter_sql = _OTHER_MIME_FILTER_SQL
        params = (start_str,)
    else:
        filter_sql = "AND mime LIKE ?"
        params = (start_str, mime_prefix + '%')

    def fetch(select_sql, tail_sql=''):
        return _select_from_access_log(select_sql, filter_sql, params, tail_sql)

    # 1. Top files by access count
    top_file_rows = fetch(
        "SELECT rel_path, COUNT(*) AS access_count",
        "GROUP BY rel_path ORDER BY access_count DESC LIMIT 20",
    )
    rows = [dict(rel_path=path, access_count=count) for path, count in top_file_rows]

    # 2. + 3. Distinct devices and total accesses per time bucket.
    # bucket_len is an int taken from the table above, never user input.
    bucket_select = f"SELECT substr(timestamp, 1, {int(bucket_len)}) AS bucket"
    bucket_tail = "GROUP BY bucket ORDER BY bucket"
    distinct_device_data = [
        dict(bucket=bucket, count=count)
        for bucket, count in fetch(
            bucket_select + ", COUNT(DISTINCT device_id) AS count", bucket_tail)
    ]
    timeframe_data = [
        dict(bucket=bucket, count=count)
        for bucket, count in fetch(bucket_select + ", COUNT(*) AS count", bucket_tail)
    ]

    # 4. Device-type distribution derived from the user agent strings
    device_counts = {}
    for user_agent, count in fetch(
            "SELECT user_agent, COUNT(*) AS count",
            "GROUP BY user_agent ORDER BY count DESC"):
        # user_agent may be NULL in the database; treat it as an empty string.
        device = get_device_type(user_agent or '')
        device_counts[device] = device_counts.get(device, 0) + count
    user_agent_data = [
        dict(device=device, count=count) for device, count in device_counts.items()
    ]

    # 5. Parent folder distribution (top 10)
    folder_counts = {}
    for rel_path, count in fetch(
            "SELECT rel_path, COUNT(*) AS count",
            "GROUP BY rel_path ORDER BY count DESC"):
        # Paths without a folder component (or a NULL path) count as "Root".
        if rel_path and '/' in rel_path:
            parent_folder = rel_path.rsplit('/', 1)[0]
        else:
            parent_folder = "Root"
        folder_counts[parent_folder] = folder_counts.get(parent_folder, 0) + count
    folder_data = sorted(
        (dict(folder=folder, count=count) for folder, count in folder_counts.items()),
        key=lambda item: item['count'],
        reverse=True,
    )[:10]

    # 6. Access counts per IP address
    ip_rows = fetch(
        "SELECT ip_address, COUNT(*) AS count",
        "GROUP BY ip_address ORDER BY count DESC",
    )

    # 7. Summary stats, computed in a single pass. An aggregate query without
    # GROUP BY always yields exactly one row; the percentage is NULL (None)
    # when no rows match.
    total_accesses, unique_files, unique_user, cached_percentage = fetch(
        "SELECT COUNT(*), COUNT(DISTINCT rel_path), COUNT(DISTINCT device_id), "
        "(CAST(SUM(CASE WHEN cached = 1 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*)) * 100"
    )[0]
    if cached_percentage is not None:
        cached_percentage = f"{cached_percentage:.2f}"

    # 8. Location distribution via GeoIP2 (top 20). The context manager
    # closes the reader even if the loop raises.
    location_counts = {}
    with geoip2.database.Reader('GeoLite2-City.mmdb') as reader:
        for ip_addr, count in ip_rows:
            place = lookup_location(ip_addr, reader)
            location_counts[place] = location_counts.get(place, 0) + count
    location_data = sorted(
        (dict(country=country, city=city, count=count)
         for (country, city), count in location_counts.items()),
        key=lambda item: item['count'],
        reverse=True,
    )[:20]

    return render_template(
        "dashboard.html",
        timeframe=session['timeframe'],
        rows=rows,
        distinct_device_data=distinct_device_data,
        user_agent_data=user_agent_data,
        folder_data=folder_data,
        location_data=location_data,
        total_accesses=total_accesses,
        unique_files=unique_files,
        unique_user=unique_user,
        cached_percentage=cached_percentage,
        timeframe_data=timeframe_data
    )