add cache analyzer
parent 8ee07b57eb
commit 462581b698

cache_analyzer.py (new file, 472 lines)
@@ -0,0 +1,472 @@
#!/usr/bin/env python3
"""
cache_analyzer.py (READ-ONLY)

Analyzes DiskCache-backed caches (audio, image, video, other) by inspecting
their SQLite DBs in STRICT READ-ONLY mode (no writes, no cache opens).

What it shows per cache:
- Directory usage (filesystem)
- DB stats (page size/count, freelist, tables, row counts)
- Total payload (sum of the size column)
- 10 oldest / 10 latest entries by Cache.access_time
- Top 10 largest entries
- Size stats: avg, median (approx), min, max
- Size distribution buckets
- Expiration: expired count, expiring in the next 24h and next 7d (if expire_time is present)
- Entries per day (last 14 days) based on access_time
- Top 10 prefixes by bytes (prefix = text before the first '/'), if the key is TEXT

Usage:
    python cache_analyzer.py
    # override paths via env vars if needed:
    FILECACHE_AUDIO=/path/to/filecache_audio python cache_analyzer.py
"""

import os
import sqlite3
import time
from datetime import datetime
from typing import Optional, Dict, Any, List

# -------- Config (override with env vars if needed) --------
CACHE_DIRS = {
    "audio": os.environ.get("FILECACHE_AUDIO", "./filecache_audio"),
    "image": os.environ.get("FILECACHE_IMAGE", "./filecache_image"),
    "video": os.environ.get("FILECACHE_VIDEO", "./filecache_video"),
    "other": os.environ.get("FILECACHE_OTHER", "./filecache_other"),
}

# Column heuristics (actual column names are expected to include: access_time, expire_time, size, key)
TS_COL = "access_time"  # recency/age is based ONLY on Cache.access_time
EXPIRE_COLS = ["expire_time"]  # best-effort
SIZE_CANDIDATES = ["size", "bytes", "value_size", "data_size"]
KEY_CANDIDATES = ["key", "path", "name", "url", "k"]


def human_bytes(n: Optional[int]) -> str:
    if n is None:
        return "N/A"
    step = 1024.0
    for unit in ("B", "KB", "MB", "GB", "TB", "PB", "EB"):
        if n < step:
            return f"{n:.2f} {unit}"
        n /= step
    return f"{n:.2f} ZB"


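# Illustrative examples of human_bytes (not executed; values chosen for this note):
#   human_bytes(0)      -> "0.00 B"
#   human_bytes(1536)   -> "1.50 KB"
#   human_bytes(10**9)  -> "953.67 MB"   (steps are 1024-based, so a decimal GB lands below 1 GB)

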
def print_header(title: str) -> None:
    line = "=" * max(60, len(title) + 8)
    print(line)
    print(f"📦 {title}")
    print(line)


def dir_usage_bytes(path: str) -> int:
    total = 0
    for root, _, files in os.walk(path):
        for f in files:
            fp = os.path.join(root, f)
            try:
                total += os.path.getsize(fp)
            except OSError:
                pass
    return total


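# Note: this walks the whole cache directory, so the figure includes cache.db itself plus
# any value files DiskCache typically spills to disk alongside the DB for large entries;
# it is therefore usually >= the "Total payload" reported later from the size column.

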
def open_ro(db_path: str) -> Optional[sqlite3.Connection]:
    try:
        return sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
    except sqlite3.Error:
        return None


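# For example, for the audio cache this opens a URI like
#   file:./filecache_audio/cache.db?mode=ro
# mode=ro means SQLite will neither create nor modify the file; a missing or unreadable
# DB simply fails to open and is reported by the caller.

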
def list_tables(cur: sqlite3.Cursor) -> List[str]:
    cur.execute("""
        SELECT name FROM sqlite_master
        WHERE type='table' AND name NOT LIKE 'sqlite_%'
        ORDER BY name
    """)
    return [r[0] for r in cur.fetchall()]


def table_info(cur: sqlite3.Cursor, name: str) -> Dict[str, Dict[str, Any]]:
    cur.execute(f"PRAGMA table_info({name})")
    info = {}
    for cid, cname, ctype, *_ in cur.fetchall():
        info[cname] = {"cid": cid, "ctype": ctype}
    return info


def pick_first_present(cols: List[str], options: List[str]) -> Optional[str]:
    lower = {c.lower(): c for c in cols}
    for opt in options:
        if opt in lower:
            return lower[opt]
    return None


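# Illustrative example: with columns ["rowid", "key", "store_time", "access_time", "size"]
# and options ["size", "bytes"], pick_first_present returns "size". Matching is effectively
# case-insensitive because callers pass the options lowercased.

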
def fmt_epoch(v: Optional[float]) -> str:
    if v is None:
        return "N/A"
    try:
        fv = float(v)
        if fv <= 0 or fv > 1e11:
            return str(v)
        return datetime.utcfromtimestamp(fv).isoformat() + "Z"
    except Exception:
        return str(v)


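# Illustrative examples:
#   fmt_epoch(1700000000) -> "2023-11-14T22:13:20Z"
#   fmt_epoch(0)          -> "0"                 (non-positive values are passed through)
#   fmt_epoch(2e11)       -> "200000000000.0"    (values > 1e11 look like ms, also passed through)

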
def key_preview(v: Any) -> str:
    if v is None:
        return "NULL"
    if isinstance(v, (bytes, bytearray)):
        try:
            s = v.decode("utf-8", errors="strict")
            return s[:120] + ("…" if len(s) > 120 else "")
        except Exception:
            hx = v.hex()
            return "0x" + (hx[:120] + ("…" if len(hx) > 120 else ""))
    s = str(v)
    return s[:120] + ("…" if len(s) > 120 else "")


def median_via_sql(cur: sqlite3.Cursor, table: str, size_col: str) -> Optional[float]:
    # Approximate median using LIMIT/OFFSET (works fine for typical cache sizes)
    try:
        cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {size_col} IS NOT NULL")
        n = cur.fetchone()[0] or 0
        if n == 0:
            return None
        offset = (n - 1) // 2
        cur.execute(f"""
            SELECT {size_col} FROM {table}
            WHERE {size_col} IS NOT NULL
            ORDER BY {size_col} ASC
            LIMIT 1 OFFSET {offset}
        """)
        row = cur.fetchone()
        return float(row[0]) if row else None
    except Exception:
        return None


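# Worked example of the offset arithmetic: for n = 7 rows, offset = (7 - 1) // 2 = 3, so the
# 4th-smallest value is returned (the exact median); for n = 8, offset = 3 again, so the lower
# of the two middle values is returned (hence "approx" for even counts).

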
def top_entries(cur: sqlite3.Cursor, table: str, key_col: Optional[str], size_col: Optional[str],
                time_col: str, asc: bool, limit: int = 10):
    order = "ASC" if asc else "DESC"
    label = "Oldest" if asc else "Latest"
    cols = [time_col]
    if key_col:
        cols.append(key_col)
    if size_col:
        cols.append(size_col)
    col_list = ", ".join(cols)
    try:
        cur.execute(f"""
            SELECT {col_list} FROM {table}
            WHERE {time_col} IS NOT NULL
            ORDER BY {time_col} {order}
            LIMIT {limit}
        """)
        rows = cur.fetchall()
        print(f" {label} {limit} by {time_col}:")
        for r in rows:
            i = 0
            t_s = fmt_epoch(r[i]); i += 1
            k_s = key_preview(r[i]) if key_col else "-"
            if key_col:
                i += 1
            sz_s = human_bytes(int(r[i])) if (size_col and r[i] is not None) else "-"
            print(f" time={t_s} key={k_s} size={sz_s}")
    except Exception as e:
        # label is assigned before the try block, so this message is safe even if the query fails
        print(f" ({label.lower()} query error: {e})")


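# Illustrative output line (hypothetical values):
#   time=2024-05-01T12:00:00Z key=audio/track-1.mp3 size=4.20 MB

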
def top_largest(cur: sqlite3.Cursor, table: str, key_col: Optional[str], size_col: Optional[str], limit: int = 10):
    if not size_col:
        print(" Top largest: (no size column)")
        return
    cols = [size_col]
    if key_col:
        cols.append(key_col)
    col_list = ", ".join(cols)
    try:
        cur.execute(f"""
            SELECT {col_list} FROM {table}
            WHERE {size_col} IS NOT NULL
            ORDER BY {size_col} DESC
            LIMIT {limit}
        """)
        rows = cur.fetchall()
        print(f" Top {limit} largest entries:")
        for r in rows:
            sz = human_bytes(int(r[0] or 0))
            key_s = key_preview(r[1]) if key_col else "-"
            print(f" size={sz} key={key_s}")
    except Exception as e:
        print(f" (largest query error: {e})")


def size_stats(cur: sqlite3.Cursor, table: str, size_col: Optional[str]):
    if not size_col:
        print(" Size stats: (no size column)")
        return
    try:
        cur.execute(f"""
            SELECT COUNT(*), SUM({size_col}), AVG({size_col}),
                   MIN({size_col}), MAX({size_col})
            FROM {table}
            WHERE {size_col} IS NOT NULL
        """)
        cnt, total_b, avg_b, min_b, max_b = cur.fetchone()
        med_b = median_via_sql(cur, table, size_col)
        print(" Size stats:")
        print(f" entries: {cnt}")
        print(f" total: {human_bytes(int(total_b or 0))}")
        print(f" avg: {human_bytes(int(avg_b or 0)) if avg_b else 'N/A'}")
        print(f" median: {human_bytes(int(med_b or 0)) if med_b else 'N/A'}")
        print(f" min: {human_bytes(int(min_b or 0)) if min_b is not None else 'N/A'}")
        print(f" max: {human_bytes(int(max_b or 0)) if max_b is not None else 'N/A'}")
    except Exception as e:
        print(f" (size stats error: {e})")


def size_distribution(cur: sqlite3.Cursor, table: str, size_col: Optional[str]):
    if not size_col:
        print(" Size distribution: (no size column)")
        return
    # Buckets: <1MB, 1-10MB, 10-100MB, 100MB-1GB, 1-5GB, >=5GB
    try:
        cur.execute(f"""
            SELECT
              SUM(CASE WHEN {size_col} < 1024*1024 THEN 1 ELSE 0 END) AS lt_1mb,
              SUM(CASE WHEN {size_col} >= 1024*1024 AND {size_col} < 10*1024*1024 THEN 1 ELSE 0 END) AS _1_10mb,
              SUM(CASE WHEN {size_col} >= 10*1024*1024 AND {size_col} < 100*1024*1024 THEN 1 ELSE 0 END) AS _10_100mb,
              SUM(CASE WHEN {size_col} >= 100*1024*1024 AND {size_col} < 1024*1024*1024 THEN 1 ELSE 0 END) AS _100mb_1gb,
              SUM(CASE WHEN {size_col} >= 1024*1024*1024 AND {size_col} < 5*1024*1024*1024 THEN 1 ELSE 0 END) AS _1_5gb,
              SUM(CASE WHEN {size_col} >= 5*1024*1024*1024 THEN 1 ELSE 0 END) AS gte_5gb
            FROM {table}
            WHERE {size_col} IS NOT NULL
        """)
        (lt1, b1, b2, b3, b4, g5) = [int(x or 0) for x in cur.fetchone()]
        print(" Size distribution (count):")
        print(f" <1MB: {lt1}")
        print(f" 1–10MB: {b1}")
        print(f" 10–100MB: {b2}")
        print(f" 100MB–1GB: {b3}")
        print(f" 1–5GB: {b4}")
        print(f" ≥5GB: {g5}")
    except Exception as e:
        print(f" (size distribution error: {e})")


def expiration_stats(cur: sqlite3.Cursor, table: str, expire_col: Optional[str], now_epoch: int):
    if not expire_col:
        print(" Expiration: (no expire column)")
        return
    try:
        cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {expire_col} IS NOT NULL AND {expire_col} < ?", (now_epoch,))
        expired = cur.fetchone()[0] or 0
        cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {expire_col} >= ? AND {expire_col} < ?", (now_epoch, now_epoch + 24*3600))
        exp_24h = cur.fetchone()[0] or 0
        cur.execute(f"SELECT COUNT(*) FROM {table} WHERE {expire_col} >= ? AND {expire_col} < ?", (now_epoch, now_epoch + 7*24*3600))
        exp_7d = cur.fetchone()[0] or 0
        print(" Expiration (by expire_time):")
        print(f" expired: {expired}")
        print(f" expiring in 24h: {exp_24h}")
        print(f" expiring in 7d: {exp_7d}")
    except Exception as e:
        print(f" (expiration stats error: {e})")


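# Note on the expiration windows: the 24h and 7d queries both start at "now", so the 7d
# figure includes everything already counted in the 24h figure (cumulative, not disjoint
# buckets).

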
def entries_per_day(cur: sqlite3.Cursor, table: str, time_col: str, days: int = 14):
    try:
        cur.execute(f"""
            SELECT date({time_col}, 'unixepoch') AS d, COUNT(*)
            FROM {table}
            WHERE {time_col} IS NOT NULL
            GROUP BY d
            ORDER BY d DESC
            LIMIT {days}
        """)
        rows = cur.fetchall()
        print(f" Entries per day (last {days} days):")
        for d, c in rows:
            print(f" {d}: {c}")
    except Exception as e:
        print(f" (entries-per-day error: {e})")


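# Note: this lists the `days` most recent calendar dates that have at least one entry
# (grouped by access_time), so days with zero activity are simply absent rather than
# shown as 0.

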
def top_prefixes_by_bytes(cur: sqlite3.Cursor, table: str, key_col: Optional[str], size_col: Optional[str], limit: int = 10):
    if not key_col or not size_col:
        print(" Top prefixes: (need key and size columns)")
        return
    # Only consider TEXT keys (skip BLOB keys)
    try:
        cur.execute(f"""
            SELECT
              CASE
                WHEN typeof({key_col})='text' AND instr({key_col}, '/')>0
                  THEN substr({key_col}, 1, instr({key_col}, '/')-1)
                WHEN typeof({key_col})='text'
                  THEN {key_col}
                ELSE '(non-text)'
              END AS prefix,
              SUM({size_col}) AS total_bytes,
              COUNT(*) AS cnt
            FROM {table}
            WHERE {size_col} IS NOT NULL
            GROUP BY prefix
            ORDER BY total_bytes DESC
            LIMIT {limit}
        """)
        rows = cur.fetchall()
        print(f" Top {limit} prefixes by bytes:")
        for prefix, total_b, cnt in rows:
            print(f" {prefix}: {human_bytes(int(total_b or 0))} across {cnt} entries")
    except Exception as e:
        print(f" (top prefixes error: {e})")


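# Illustrative grouping (hypothetical keys): 'audio/track-1.mp3' and 'audio/track-2.mp3'
# both fall under prefix 'audio'; a text key without '/' is its own prefix; pickled/BLOB
# keys are lumped together as '(non-text)'.

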
def analyze_sqlite_db(db_path: str) -> None:
    print(f" • DB path: {db_path}")
    if not os.path.exists(db_path):
        print(" Status: ❌ not found")
        return

    # filesystem size (db file)
    try:
        fsize = os.path.getsize(db_path)
        print(f" File size: {human_bytes(fsize)}")
    except Exception:
        pass

    conn = open_ro(db_path)
    if not conn:
        print(" Status: ❌ open failed (read-only)")
        return

    now_epoch = int(time.time())

    try:
        cur = conn.cursor()
        # SQLite stats
        cur.execute("PRAGMA page_size;")
        page_size = cur.fetchone()[0]
        cur.execute("PRAGMA page_count;")
        page_count = cur.fetchone()[0]
        cur.execute("PRAGMA freelist_count;")
        freelist_count = cur.fetchone()[0]
        db_bytes = page_size * page_count
        print(f" Page size: {page_size} B")
        print(f" Page count: {page_count}")
        print(f" DB size (pages × page size): {human_bytes(db_bytes)}")
        print(f" Freelist pages: {freelist_count}")

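        # Context (illustrative numbers): page_count × page_size is essentially the DB file
        # size; e.g. 25,000 pages × 4096 B ≈ 97.66 MB. Freelist pages are allocated but
        # unused, i.e. space a VACUUM could reclaim.
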
        # tables
        tables = list_tables(cur)
        if not tables:
            print(" Tables: (none)")
            return

        print(" Tables:")
        for t in tables:
            print(f" - {t}")

        # row counts
        print(" Row counts:")
        for t in tables:
            try:
                cur.execute(f"SELECT COUNT(*) FROM {t}")
                cnt = cur.fetchone()[0]
                print(f" {t:<20} {cnt}")
            except Exception as e:
                print(f" {t:<20} (error: {e})")

        # choose entries table (DiskCache default is 'Cache')
        probe = next((n for n in tables if n.lower() == "cache"), None) or tables[0]
        cols_info = table_info(cur, probe)
        colnames = list(cols_info.keys())
        lower = [c.lower() for c in colnames]

        # prefer 'access_time' (the only timestamp used for recency/age)
        time_col = next((c for c in colnames if c.lower() == TS_COL.lower()), None)
        if not time_col:
            print(" Note: No 'access_time' column found; skipping chronology-based lists.")

        # choose size & key columns
        size_col_l = pick_first_present(lower, [c.lower() for c in SIZE_CANDIDATES])
        size_col = next((c for c in colnames if c.lower() == size_col_l), None) if size_col_l else None
        key_col_l = pick_first_present(lower, [c.lower() for c in KEY_CANDIDATES])
        key_col = next((c for c in colnames if c.lower() == key_col_l), None) if key_col_l else None

        # total payload (sum of the size column)
        if size_col:
            try:
                cur.execute(f"SELECT SUM({size_col}) FROM {probe}")
                total_b = cur.fetchone()[0]
                print(f" Total payload: {human_bytes(int(total_b or 0))} (sum of {probe}.{size_col})")
            except Exception as e:
                print(f" Total payload: (error: {e})")
        else:
            print(" Total payload: (no size column detected)")

        # chronology lists (ONLY by access_time)
        if time_col:
            top_entries(cur, probe, key_col, size_col, time_col, asc=True, limit=10)
            top_entries(cur, probe, key_col, size_col, time_col, asc=False, limit=10)

        # largest entries
        top_largest(cur, probe, key_col, size_col, limit=10)

        # size stats & distribution
        size_stats(cur, probe, size_col)
        size_distribution(cur, probe, size_col)

        # expiration
        expire_col_l = pick_first_present(lower, [c.lower() for c in EXPIRE_COLS])
        expire_col = next((c for c in colnames if c.lower() == expire_col_l), None) if expire_col_l else None
        expiration_stats(cur, probe, expire_col, now_epoch)

        # entries per day (last 14d) using access_time
        if time_col:
            entries_per_day(cur, probe, time_col, days=14)

        # top prefixes by bytes (if key is TEXT-like)
        top_prefixes_by_bytes(cur, probe, key_col, size_col, limit=10)

    finally:
        try:
            conn.close()
        except Exception:
            pass


def analyze_cache(label: str, directory: str) -> None:
    print_header(f"{label.upper()} CACHE — {directory}")
    if not os.path.isdir(directory):
        print("Directory status: ❌ not found")
        return

    # On-disk usage (directory walk, read-only)
    try:
        usage = dir_usage_bytes(directory)
        print(f"Directory usage: {human_bytes(usage)}")
    except Exception:
        pass

    # Read-only inspection of sqlite DB (DiskCache uses cache.db)
    analyze_sqlite_db(os.path.join(directory, "cache.db"))
    print()  # spacer


def analyze_all_caches() -> None:
    for label, path in CACHE_DIRS.items():
        analyze_cache(label, path)


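# Illustrative programmatic use (paths here are hypothetical; adjust to your layout):
#
#   from cache_analyzer import analyze_cache, analyze_sqlite_db
#   analyze_cache("audio", "/srv/caches/filecache_audio")        # one cache directory
#   analyze_sqlite_db("/srv/caches/filecache_image/cache.db")    # just the SQLite DB

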
if __name__ == "__main__":
    print_header("ALL CACHES OVERVIEW")
    for k, v in CACHE_DIRS.items():
        print(f"- {k:<5} -> {v}")
    print()
    analyze_all_caches()