back to sqlite

lelo 2025-03-31 22:06:32 +00:00
parent 4db37c49ff
commit 76eca80a4a
3 changed files with 220 additions and 199 deletions


@@ -1,37 +1,27 @@
import sqlite3
from flask import render_template, request
from datetime import datetime, timedelta
import geoip2.database
from auth import require_secret
import os
import psycopg2
file_access_temp = []
dbname = os.environ.get('DB_NAME')
user = os.environ.get('DB_USER')
password = os.environ.get('DB_PASSWORD')
host = os.environ.get('DB_HOST')
port = int(os.environ.get('DB_PORT', 5432))
# Name of the SQLite database file; change it to whatever you want:
DB_NAME = 'access_log.db'
connection = psycopg2.connect(dbname=dbname,
user=user,
password=password,
host=host,
port=port
)
# Enable autocommit
connection.autocommit = True
log_db = connection
# Create a single global connection to SQLite
log_db = sqlite3.connect(DB_NAME, check_same_thread=False)
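# Aside: a minimal sketch (hypothetical file name 'example.db') of the pattern
# used throughout this module -- using the sqlite3 connection as a context
# manager runs the enclosed statements in one transaction that commits on
# success and rolls back if an exception is raised; it does not close the
# connection.
#     conn = sqlite3.connect('example.db', check_same_thread=False)
#     with conn:
#         conn.execute('CREATE TABLE IF NOT EXISTS t (x INTEGER)')
#         conn.execute('INSERT INTO t (x) VALUES (?)', (1,))
#     conn.close()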
# Function to initialize the database.
def init_log_db():
with log_db.cursor() as cursor:
cursor.execute('''
"""Create the file_access_log table if it doesn't already exist."""
with log_db:
log_db.execute('''
CREATE TABLE IF NOT EXISTS file_access_log (
id SERIAL PRIMARY KEY,
timestamp TIMESTAMP,
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp TEXT,
rel_path TEXT,
filesize BIGINT,
filesize INTEGER,
mime TEXT,
ip_address TEXT,
user_agent TEXT,
@@ -42,8 +32,6 @@ def init_log_db():
init_log_db()
def lookup_location(ip, reader):
try:
response = reader.city(ip)
@@ -54,7 +42,7 @@ def lookup_location(ip, reader):
return "Unknown", "Unknown"
def get_device_type(user_agent):
"Classify device type based on user agent string"
"""Classify device type based on user agent string."""
if 'Android' in user_agent:
return 'Android'
elif 'iPhone' in user_agent or 'iPad' in user_agent:
@@ -68,22 +56,30 @@ def get_device_type(user_agent):
else:
return 'Other'
# Logging function that uses the singleton connection.
def log_file_access(rel_path, filesize, mime, ip_address, user_agent, device_id, cached):
"""Insert a file access record into the database."""
global file_access_temp
timestamp = datetime.now() # Use datetime object directly
with log_db.connection.cursor() as cursor:
cursor.execute('''
INSERT INTO file_access_log (timestamp, rel_path, filesize, mime, ip_address, user_agent, device_id, cached)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
''', (timestamp, rel_path, filesize, mime, ip_address, user_agent, device_id, cached))
file_access_temp.insert(0, [timestamp.isoformat(), rel_path, filesize, mime, ip_address, user_agent, device_id, cached])
return timestamp.isoformat()
timestamp = datetime.now() # a datetime object
# Store the ISO timestamp in the database for easy lexical comparison
iso_ts = timestamp.isoformat()
with log_db:
log_db.execute('''
INSERT INTO file_access_log
(timestamp, rel_path, filesize, mime, ip_address, user_agent, device_id, cached)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
''', (iso_ts, rel_path, filesize, mime, ip_address, user_agent, device_id, cached))
file_access_temp.insert(0, [iso_ts, rel_path, filesize, mime, ip_address, user_agent, device_id, cached])
return iso_ts
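# Aside: a small sketch of why ISO-8601 strings are safe for the
# "timestamp >= ?" comparisons in the dashboard queries below -- for this
# format, lexicographic order matches chronological order (dates are
# hypothetical):
#     a = datetime(2025, 3, 1, 8, 0).isoformat()    # '2025-03-01T08:00:00'
#     b = datetime(2025, 3, 31, 22, 6).isoformat()  # '2025-03-31T22:06:00'
#     assert a < b                    # string order agrees with time order
#     assert b[:10] == '2025-03-31'   # slicing the string yields the date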
def return_file_access():
"""Return recent file access logs from memory (the last 10 minutes)."""
global file_access_temp
if file_access_temp:
cutoff_time = datetime.now() - timedelta(minutes=10)
# Convert each stored timestamp (ISO string) back to datetime
file_access_temp[:] = [
entry for entry in file_access_temp
if datetime.fromisoformat(entry[0]) >= cutoff_time
@@ -105,186 +101,234 @@ def dashboard():
# Determine which file type we're filtering by.
filetype = 'other'
allowed_list = ['mp3', 'wav', 'audio']
if filetype_arg.lower() in allowed_list:
# Simple keyword lists that decide how we match the MIME type
audio_list = ['mp3', 'wav', 'audio']
image_list = ['jpg', 'jpeg', 'image', 'photo']
video_list = ['mp4', 'mov', 'wmv', 'avi']
if filetype_arg.lower() in audio_list:
filetype = 'audio/'
allowed_list = ['jpg', 'jpeg', 'image', 'photo']
if filetype_arg.lower() in allowed_list:
elif filetype_arg.lower() in image_list:
filetype = 'image/'
allowed_list = ['mp4', 'mov', 'wmv', 'avi']
if filetype_arg.lower() in allowed_list:
elif filetype_arg.lower() in video_list:
filetype = 'video/'
# Determine the start time based on timeframe.
# Determine start time based on timeframe
if timeframe == 'today':
start = now.replace(hour=0, minute=0, second=0, microsecond=0)
start_dt = now.replace(hour=0, minute=0, second=0, microsecond=0)
elif timeframe == '7days':
start = now - timedelta(days=7)
start_dt = now - timedelta(days=7)
elif timeframe == '30days':
start = now - timedelta(days=30)
start_dt = now - timedelta(days=30)
elif timeframe == '365days':
start = now - timedelta(days=365)
start_dt = now - timedelta(days=365)
else:
start = now.replace(hour=0, minute=0, second=0, microsecond=0)
start_dt = now.replace(hour=0, minute=0, second=0, microsecond=0)
# Build the SQL filter for mime
# We'll compare the textual timestamp (ISO 8601).
start_str = start_dt.isoformat()
# Build the SQL filter
if filetype == 'other':
# Exclude audio, image, and video mimes
filetype_filter_sql = "AND mime NOT LIKE 'audio/%' AND mime NOT LIKE 'image/%' AND mime NOT LIKE 'video/%'"
params = (start,)
# Exclude audio, image, video
filetype_filter_sql = (
"AND mime NOT LIKE 'audio/%' "
"AND mime NOT LIKE 'image/%' "
"AND mime NOT LIKE 'video/%' "
)
params_for_filter = (start_str,)
else:
# Filter for mimes that start with the given type.
filetype_filter_sql = "AND mime LIKE %s"
params = (start, filetype + '%')
# Filter for mimes that start with the given type
filetype_filter_sql = "AND mime LIKE ?"
params_for_filter = (start_str, filetype + '%')
with log_db.connection.cursor() as cursor:
# Raw file access counts (top files)
query = f'''
SELECT rel_path, COUNT(*) as access_count
FROM file_access_log
WHERE timestamp >= %s {filetype_filter_sql}
GROUP BY rel_path
ORDER BY access_count DESC
LIMIT 20
'''
cursor.execute(query, params)
# 1. Top files by access count
query = f'''
SELECT rel_path, COUNT(*) as access_count
FROM file_access_log
WHERE timestamp >= ? {filetype_filter_sql}
GROUP BY rel_path
ORDER BY access_count DESC
LIMIT 20
'''
with log_db:
cursor = log_db.execute(query, params_for_filter)
rows = cursor.fetchall()
# Daily access trend for a line chart
query = f'''
SELECT CAST(timestamp AS DATE) as date, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= %s {filetype_filter_sql}
GROUP BY CAST(timestamp AS DATE)
ORDER BY date
'''
cursor.execute(query, params)
daily_access_data = [dict(date=str(row[0]), count=row[1]) for row in cursor.fetchall()]
# 2. Daily access trend (line chart)
# We'll group by day using substr(timestamp, 1, 10) -> YYYY-MM-DD
query = f'''
SELECT substr(timestamp, 1, 10) AS date, COUNT(*) AS count
FROM file_access_log
WHERE timestamp >= ? {filetype_filter_sql}
GROUP BY date
ORDER BY date
'''
with log_db:
cursor = log_db.execute(query, params_for_filter)
daily_rows = cursor.fetchall()
daily_access_data = [
dict(date=r[0], count=r[1]) for r in daily_rows
]
# Aggregate download counts by time bucket according to the timeframe.
if timeframe == 'today':
query = f'''
SELECT to_char(timestamp, 'HH24') as bucket, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= %s {filetype_filter_sql}
GROUP BY bucket
ORDER BY bucket
'''
cursor.execute(query, params)
elif timeframe in ('7days', '30days'):
query = f'''
SELECT CAST(timestamp AS DATE) as bucket, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= %s {filetype_filter_sql}
GROUP BY bucket
ORDER BY bucket
'''
cursor.execute(query, params)
elif timeframe == '365days':
query = f'''
SELECT to_char(timestamp, 'YYYY-MM') as bucket, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= %s {filetype_filter_sql}
GROUP BY bucket
ORDER BY bucket
'''
cursor.execute(query, params)
else:
query = f'''
SELECT CAST(timestamp AS DATE) as bucket, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= %s {filetype_filter_sql}
GROUP BY bucket
ORDER BY bucket
'''
cursor.execute(query, params)
timeframe_data = [dict(bucket=row[0], count=row[1]) for row in cursor.fetchall()]
# User agent distribution (aggregate by device type)
# 3. Timeframe-based aggregation
# We'll group by hour if "today", by day if "7days"/"30days", by month if "365days".
if timeframe == 'today':
# Hour: substr(timestamp, 12, 2) -> HH
query = f'''
SELECT user_agent, COUNT(*) as count
SELECT substr(timestamp, 12, 2) AS bucket, COUNT(*) AS count
FROM file_access_log
WHERE timestamp >= %s {filetype_filter_sql}
GROUP BY user_agent
ORDER BY count DESC
WHERE timestamp >= ? {filetype_filter_sql}
GROUP BY bucket
ORDER BY bucket
'''
cursor.execute(query, params)
raw_user_agents = [dict(user_agent=row[0], count=row[1]) for row in cursor.fetchall()]
device_counts = {}
for entry in raw_user_agents:
device = get_device_type(entry['user_agent'])
device_counts[device] = device_counts.get(device, 0) + entry['count']
user_agent_data = [dict(device=device, count=count) for device, count in device_counts.items()]
# Parent folder distribution
elif timeframe in ('7days', '30days'):
# Day: substr(timestamp, 1, 10) -> YYYY-MM-DD
query = f'''
SELECT rel_path, COUNT(*) as count
SELECT substr(timestamp, 1, 10) AS bucket, COUNT(*) AS count
FROM file_access_log
WHERE timestamp >= %s {filetype_filter_sql}
GROUP BY rel_path
ORDER BY count DESC
WHERE timestamp >= ? {filetype_filter_sql}
GROUP BY bucket
ORDER BY bucket
'''
cursor.execute(query, params)
folder_data = {}
for row in cursor.fetchall():
rel_path = row[0]
parent_folder = rel_path.rsplit('/', 1)[0] if '/' in rel_path else "Root"
folder_data[parent_folder] = folder_data.get(parent_folder, 0) + row[1]
folder_data = [dict(folder=folder, count=count) for folder, count in folder_data.items()]
folder_data.sort(key=lambda x: x['count'], reverse=True)
folder_data = folder_data[:10]
# Aggregate IP addresses with counts
elif timeframe == '365days':
# Month: substr(timestamp, 1, 7) -> YYYY-MM
query = f'''
SELECT ip_address, COUNT(*) as count
SELECT substr(timestamp, 1, 7) AS bucket, COUNT(*) AS count
FROM file_access_log
WHERE timestamp >= %s {filetype_filter_sql}
GROUP BY ip_address
ORDER BY count DESC
WHERE timestamp >= ? {filetype_filter_sql}
GROUP BY bucket
ORDER BY bucket
'''
cursor.execute(query, params)
else:
# Default: group by day
query = f'''
SELECT substr(timestamp, 1, 10) AS bucket, COUNT(*) AS count
FROM file_access_log
WHERE timestamp >= ? {filetype_filter_sql}
GROUP BY bucket
ORDER BY bucket
'''
with log_db:
cursor = log_db.execute(query, params_for_filter)
timeframe_data_rows = cursor.fetchall()
timeframe_data = [
dict(bucket=r[0], count=r[1]) for r in timeframe_data_rows
]
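# Aside: how the substr() bucketing above maps onto an ISO-8601 timestamp such
# as '2025-03-31T22:06:32' (SQLite substr() is 1-indexed):
#     substr(timestamp, 1, 10) -> '2025-03-31'  (day bucket,   ts[0:10] in Python)
#     substr(timestamp, 12, 2) -> '22'          (hour bucket,  ts[11:13])
#     substr(timestamp, 1, 7)  -> '2025-03'     (month bucket, ts[0:7])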
# 4. User agent distribution
query = f'''
SELECT user_agent, COUNT(*) AS count
FROM file_access_log
WHERE timestamp >= ? {filetype_filter_sql}
GROUP BY user_agent
ORDER BY count DESC
'''
with log_db:
cursor = log_db.execute(query, params_for_filter)
raw_user_agents = cursor.fetchall()
device_counts = {}
for (ua, cnt) in raw_user_agents:
device = get_device_type(ua)
device_counts[device] = device_counts.get(device, 0) + cnt
user_agent_data = [
dict(device=d, count=c) for d, c in device_counts.items()
]
# 5. Parent folder distribution
query = f'''
SELECT rel_path, COUNT(*) AS count
FROM file_access_log
WHERE timestamp >= ? {filetype_filter_sql}
GROUP BY rel_path
ORDER BY count DESC
'''
folder_data_dict = {}
with log_db:
cursor = log_db.execute(query, params_for_filter)
for (rp, c) in cursor.fetchall():
if '/' in rp:
parent_folder = rp.rsplit('/', 1)[0]
else:
parent_folder = "Root"
folder_data_dict[parent_folder] = folder_data_dict.get(parent_folder, 0) + c
folder_data = [dict(folder=f, count=cnt) for f, cnt in folder_data_dict.items()]
folder_data.sort(key=lambda x: x['count'], reverse=True)
folder_data = folder_data[:10]
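# Aside: the parent-folder grouping above hinges on rsplit with maxsplit=1;
# for example (hypothetical paths):
#     'albums/2024/track01.mp3'.rsplit('/', 1)[0]  -> 'albums/2024'
#     'readme.txt'                                 -> no '/', counted as "Root"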
# 6. Aggregate IP addresses with counts
query = f'''
SELECT ip_address, COUNT(*) as count
FROM file_access_log
WHERE timestamp >= ? {filetype_filter_sql}
GROUP BY ip_address
ORDER BY count DESC
'''
with log_db:
cursor = log_db.execute(query, params_for_filter)
ip_rows = cursor.fetchall()
# Summary stats using separate SQL queries
query = f'SELECT COUNT(*) FROM file_access_log WHERE timestamp >= %s {filetype_filter_sql}'
cursor.execute(query, params)
# 7. Summary stats
# total_accesses
query = f'''
SELECT COUNT(*)
FROM file_access_log
WHERE timestamp >= ? {filetype_filter_sql}
'''
with log_db:
cursor = log_db.execute(query, params_for_filter)
total_accesses = cursor.fetchone()[0]
query = f'SELECT COUNT(DISTINCT rel_path) FROM file_access_log WHERE timestamp >= %s {filetype_filter_sql}'
cursor.execute(query, params)
# unique_files
query = f'''
SELECT COUNT(DISTINCT rel_path)
FROM file_access_log
WHERE timestamp >= ? {filetype_filter_sql}
'''
with log_db:
cursor = log_db.execute(query, params_for_filter)
unique_files = cursor.fetchone()[0]
query = f'SELECT COUNT(DISTINCT device_id) FROM file_access_log WHERE timestamp >= %s {filetype_filter_sql}'
cursor.execute(query, params)
# unique_user
query = f'''
SELECT COUNT(DISTINCT device_id)
FROM file_access_log
WHERE timestamp >= ? {filetype_filter_sql}
'''
with log_db:
cursor = log_db.execute(query, params_for_filter)
unique_user = cursor.fetchone()[0]
# Process location data with GeoIP2.
# 8. Process location data with GeoIP2
reader = geoip2.database.Reader('GeoLite2-City.mmdb')
location_data = {}
for ip, count in ip_rows:
country, city = lookup_location(ip, reader)
location_data_dict = {}
for (ip_addr, cnt) in ip_rows:
country, city = lookup_location(ip_addr, reader)
key = (country, city)
location_data[key] = location_data.get(key, 0) + count
location_data_dict[key] = location_data_dict.get(key, 0) + cnt
reader.close()
location_data = [dict(country=key[0], city=key[1], count=value) for key, value in location_data.items()]
location_data = [
dict(country=k[0], city=k[1], count=v)
for k, v in location_data_dict.items()
]
location_data.sort(key=lambda x: x['count'], reverse=True)
location_data = location_data[:20]
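# Aside: a hedged sketch of the per-IP lookup feeding location_data, using the
# geoip2 reader opened above (the IP below is a documentation placeholder):
#     import geoip2.errors
#     with geoip2.database.Reader('GeoLite2-City.mmdb') as reader:
#         try:
#             resp = reader.city('203.0.113.7')
#             country, city = resp.country.name, resp.city.name
#         except geoip2.errors.AddressNotFoundError:
#             country, city = 'Unknown', 'Unknown'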
return render_template("dashboard.html",
timeframe=timeframe,
rows=rows,
daily_access_data=daily_access_data,
user_agent_data=user_agent_data,
folder_data=folder_data,
location_data=location_data,
total_accesses=total_accesses,
unique_files=unique_files,
unique_user=unique_user,
timeframe_data=timeframe_data)
# Convert the top-files rows to a list of dictionaries
# (just for consistency in passing to template).
rows = [dict(rel_path=r[0], access_count=r[1]) for r in rows]
return render_template(
"dashboard.html",
timeframe=timeframe,
rows=rows,
daily_access_data=daily_access_data,
user_agent_data=user_agent_data,
folder_data=folder_data,
location_data=location_data,
total_accesses=total_accesses,
unique_files=unique_files,
unique_user=unique_user,
timeframe_data=timeframe_data
)


@@ -19,15 +19,8 @@ services:
- FLASK_ENV=production
- TITLE_SHORT=${TITLE_SHORT}
- TITLE_LONG=${TITLE_LONG}
- DB_HOST=postgres-db
- DB_USER=${DB_USER}
- DB_PASSWORD=${DB_PASSWORD}
- DB_NAME=${DB_NAME}
depends_on:
- "postgres"
networks:
- traefik
- internal
labels:
- "traefik.enable=true"
@@ -51,22 +44,7 @@ services:
sh -c "pip install -r requirements.txt &&
gunicorn --worker-class eventlet -w 1 -b 0.0.0.0:5000 app:app"
postgres:
image: postgres:17
restart: always
environment:
POSTGRES_USER: ${DB_USER:?}
POSTGRES_PASSWORD: ${DB_PASSWORD:?}
POSTGRES_DB: ${DB_NAME:?}
volumes:
- ./postgres_data:/var/lib/postgresql/data
networks:
internal:
aliases:
- postgres-db
networks:
traefik:
external: true
internal:
internal: true


@@ -6,4 +6,3 @@ diskcache
geoip2
gunicorn
eventlet
psycopg2-binary