add crawler

This commit is contained in:
lelo 2025-03-18 22:37:50 +00:00
parent 3fa915a9ce
commit d233e66268

62
app.py
View File

@ -424,6 +424,68 @@ def get_transcript(filename):
content = f.read() content = f.read()
return content, 200, {'Content-Type': 'text/markdown; charset=utf-8'} return content, 200, {'Content-Type': 'text/markdown; charset=utf-8'}
@app.route("/crawl/<path:start_relative_path>")
@require_secret
def crawl_and_cache(start_relative_path):
    """
    Walk a directory below app.config['FILE_ROOT'] and cache every file in it.

    Files whose guessed MIME type starts with ``image/`` are shrunk to fit
    within 1200x1200, re-encoded as PNG, and cached as
    ``(png_bytes, 'image/png')``. All other files are cached unmodified as
    ``(file_bytes, guessed_mime)``. Files whose key is already present in the
    cache are skipped. Failures on individual files are logged and do not
    abort the crawl.

    :param start_relative_path: The folder (relative to FILE_ROOT) to start crawling.
    :return: A JSON response ``{"cached_files": [...]}`` listing the cache keys
        (paths relative to FILE_ROOT) stored by this call, with status 200;
        or ``{"error": "Invalid path"}`` with status 400 when the requested
        folder resolves outside FILE_ROOT.
    """
    # Resolve both paths the same way so the containment test compares like
    # with like even when FILE_ROOT is configured as a relative path.
    file_root = os.path.abspath(app.config['FILE_ROOT'])
    base_dir = os.path.abspath(os.path.join(file_root, start_relative_path))

    # A bare startswith(file_root) would also accept sibling directories such
    # as "<FILE_ROOT>_private"; requiring the trailing separator closes that
    # hole. os.path.join(root, '') appends the separator only when needed.
    root_prefix = os.path.join(file_root, '')
    if base_dir != file_root and not base_dir.startswith(root_prefix):
        return jsonify({"error": "Invalid path"}), 400

    cached_files = []  # Cache keys stored during this crawl

    for root, _dirs, files in os.walk(base_dir):
        for filename in files:
            full_path = os.path.join(root, filename)
            # The cache key is the file's path relative to FILE_ROOT
            rel_key = os.path.relpath(full_path, file_root)

            # Skip files that are already cached
            if cache.get(rel_key):
                continue

            mime, _ = mimetypes.guess_type(full_path)
            mime = mime or 'application/octet-stream'

            if mime.startswith('image/'):
                try:
                    with Image.open(full_path) as img:
                        # thumbnail() resizes in place, preserving aspect ratio
                        img.thumbnail((1200, 1200))
                        img_bytes_io = io.BytesIO()
                        # PNG is lossless; a JPEG-style "quality" setting
                        # has no effect on it, so none is passed.
                        img.save(img_bytes_io, format='PNG')
                    # The stored bytes are PNG regardless of the source
                    # format, so record the MIME type that matches them.
                    cache.set(rel_key, (img_bytes_io.getvalue(), 'image/png'))
                    cached_files.append(rel_key)
                except Exception as e:
                    # Best effort: one unreadable image must not stop the crawl
                    app.logger.error(f"Image processing failed for {rel_key}: {e}")
            else:
                try:
                    with open(full_path, 'rb') as f:
                        file_bytes = f.read()
                    cache.set(rel_key, (file_bytes, mime))
                    cached_files.append(rel_key)
                except Exception as e:
                    # Best effort: one unreadable file must not stop the crawl
                    app.logger.error(f"Failed to read file {rel_key}: {e}")

    # jsonify sets the JSON Content-Type, matching the error response above
    return jsonify({"cached_files": cached_files}), 200
# Catch-all route to serve the single-page application template. # Catch-all route to serve the single-page application template.
@app.route('/', defaults={'path': ''}) @app.route('/', defaults={'path': ''})
@app.route('/<path:path>') @app.route('/<path:path>')