add crawler

2025-03-18 22:37:50 +00:00 · 2025-03-18 22:37:50 +00:00 · d233e66268
commit d233e66268
parent 3fa915a9ce
1 changed files with 62 additions and 0 deletions
--- a/app.py
+++ b/app.py
@@ -424,6 +424,68 @@ def get_transcript(filename):
        content = f.read()
    return content, 200, {'Content-Type': 'text/markdown; charset=utf-8'}

+@app.route("/crawl/<path:start_relative_path>")
+@require_secret
+def crawl_and_cache(start_relative_path):
+    """
+    Crawls through a directory (relative to app.config['FILE_ROOT']) and caches each file.
+    For images, it creates a thumbnail (max 1200x1200) and caches the processed image.
+    For non-images, it simply reads and caches the file bytes.
+    
+    :param start_relative_path: The folder (relative to FILE_ROOT) to start crawling.
+    """
+    # Compute the absolute path for the starting directory
+    base_dir = os.path.normpath(os.path.join(app.config['FILE_ROOT'], start_relative_path))
+    
+    # Check that base_dir is under FILE_ROOT to prevent directory traversal
+    if not base_dir.startswith(os.path.abspath(app.config['FILE_ROOT'])):
+        return jsonify({"error": "Invalid path"}), 400
+
+    cached_files = []  # List to hold cached file relative paths
+
+    # Walk through all subdirectories and files
+    for root, dirs, files in os.walk(base_dir):
+        for filename in files:
+            full_path = os.path.join(root, filename)
+            # Compute the relative key used for caching
+            rel_key = os.path.relpath(full_path, app.config['FILE_ROOT'])
+            
+            # Skip if this file is already in the cache
+            if cache.get(rel_key):
+                continue
+
+            # Determine the MIME type
+            mime, _ = mimetypes.guess_type(full_path)
+            mime = mime or 'application/octet-stream'
+            
+            # Process image files differently
+            if mime.startswith('image/'):
+                try:
+                    with Image.open(full_path) as img:
+                        # Create a thumbnail (max 1200x1200)
+                        img.thumbnail((1200, 1200))
+                        img_bytes_io = io.BytesIO()
+                        # Save processed image as PNG
+                        img.save(img_bytes_io, format='PNG', quality=85)
+                        img_bytes = img_bytes_io.getvalue()
+                        # Cache the processed image bytes along with its mime type
+                        cache.set(rel_key, (img_bytes, mime))
+                        cached_files.append(rel_key)
+                except Exception as e:
+                    app.logger.error(f"Image processing failed for {rel_key}: {e}")
+            else:
+                # Process non-image files
+                try:
+                    with open(full_path, 'rb') as f:
+                        file_bytes = f.read()
+                    cache.set(rel_key, (file_bytes, mime))
+                    cached_files.append(rel_key)
+                except Exception as e:
+                    app.logger.error(f"Failed to read file {rel_key}: {e}")
+    
+    # Return the list of cached files as a JSON response
+    return json.dumps({"cached_files": cached_files}, indent=4), 200
+
 # Catch-all route to serve the single-page application template.
@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')