def _is_within_root(root, candidate):
    """Return True when *candidate* is *root* itself or a path beneath it.

    Both arguments are expected to be absolute, normalized paths. A plain
    ``str.startswith`` test is not sufficient here because ``/srv/files_x``
    starts with ``/srv/files`` without being inside it.
    """
    try:
        return os.path.commonpath([root, candidate]) == root
    except ValueError:
        # Raised e.g. for paths on different drives (Windows): not inside root.
        return False


def _thumbnail_png_bytes(full_path, max_size=(1200, 1200)):
    """Return PNG-encoded bytes of the image at *full_path*, shrunk to fit *max_size*."""
    with Image.open(full_path) as img:
        # thumbnail() resizes in place, keeps the aspect ratio and never upscales.
        img.thumbnail(max_size)
        buffer = io.BytesIO()
        # No ``quality`` argument: it is a JPEG option and has no effect on PNG.
        img.save(buffer, format='PNG')
        return buffer.getvalue()


# "/crawl/" alone crawls FILE_ROOT itself; "/crawl/<sub/dir>" crawls a subfolder.
@app.route("/crawl/", defaults={"start_relative_path": ""})
@app.route("/crawl/<path:start_relative_path>")
@require_secret
def crawl_and_cache(start_relative_path):
    """
    Crawl a directory below app.config['FILE_ROOT'] and cache every file found.

    For images, a thumbnail (max 1200x1200) is created, encoded as PNG and cached
    together with the MIME type 'image/png'.
    For non-images, the raw file bytes are cached together with the guessed MIME
    type ('application/octet-stream' when it cannot be guessed).
    Files whose key is already present in the cache are skipped, and files that
    cannot be processed are logged and skipped so one bad file does not abort
    the crawl.

    :param start_relative_path: The folder (relative to FILE_ROOT) to start crawling.
    :return: JSON ``{"cached_files": [...]}`` listing the keys cached by this call
             (HTTP 200), ``{"error": "Invalid path"}`` (HTTP 400) when the folder
             lies outside FILE_ROOT, or ``{"error": "Directory not found"}``
             (HTTP 404) when it does not exist.
    """
    # Use one absolute, normalized root for both the containment check and the
    # cache keys so the two cannot disagree (e.g. when FILE_ROOT is relative).
    file_root = os.path.abspath(app.config['FILE_ROOT'])
    base_dir = os.path.abspath(os.path.join(file_root, start_relative_path))

    # Reject anything that resolves outside FILE_ROOT ("..", absolute paths, ...).
    if not _is_within_root(file_root, base_dir):
        return jsonify({"error": "Invalid path"}), 400

    # os.walk() silently yields nothing for a missing directory; report it instead.
    if not os.path.isdir(base_dir):
        return jsonify({"error": "Directory not found"}), 404

    cached_files = []  # Relative keys of the files cached during this request

    # os.walk() does not descend into symlinked directories by default.
    for current_dir, _subdirs, filenames in os.walk(base_dir):
        for filename in filenames:
            full_path = os.path.join(current_dir, filename)
            # Cache key: the file's path relative to FILE_ROOT
            rel_key = os.path.relpath(full_path, file_root)

            # Skip files that already have a cache entry
            if cache.get(rel_key) is not None:
                continue

            # Determine the MIME type, falling back to a generic binary type
            guessed, _encoding = mimetypes.guess_type(full_path)
            mime = guessed or 'application/octet-stream'

            if mime.startswith('image/'):
                try:
                    payload = _thumbnail_png_bytes(full_path)
                except Exception as exc:
                    # Per-file boundary: imaging libraries raise many error types
                    # (unreadable data, unsupported mode, oversized image, ...).
                    app.logger.error("Image processing failed for %s: %s", rel_key, exc)
                    continue
                # The cached bytes are PNG regardless of the source format, so
                # the stored MIME type has to say so.
                mime = 'image/png'
            else:
                try:
                    with open(full_path, 'rb') as f:
                        payload = f.read()
                except OSError as exc:
                    app.logger.error("Failed to read file %s: %s", rel_key, exc)
                    continue

            cache.set(rel_key, (payload, mime))
            cached_files.append(rel_key)

    # jsonify sets Content-Type: application/json, matching the error responses.
    return jsonify({"cached_files": cached_files}), 200