add crawler

2025-03-18 22:37:50 +00:00 · 2025-03-18 22:37:50 +00:00 · d233e66268
commit d233e66268
parent 3fa915a9ce
1 changed files with 62 additions and 0 deletions
--- a/app.py
+++ b/app.py
@ -424,6 +424,68 @@ def get_transcript(filename):
        content = f.read()
    return content, 200, {'Content-Type': 'text/markdown; charset=utf-8'}
@app.route("/crawl/<path:start_relative_path>")
@require_secret
def crawl_and_cache(start_relative_path):
    """
    Crawl a directory below app.config['FILE_ROOT'] and cache every file in it.

    Image files (by guessed MIME type) are reduced to a thumbnail of at most
    1200x1200, re-encoded as PNG and cached as ``(png_bytes, 'image/png')``.
    All other files are cached as ``(raw_bytes, guessed_mime)``, falling back
    to ``application/octet-stream`` when the type cannot be guessed.
    Files whose key is already in the cache are skipped. A file that fails to
    process is logged and skipped so that one bad file does not abort the crawl.

    :param start_relative_path: The folder (relative to FILE_ROOT) to start crawling.
    :return: A JSON response ``{"cached_files": [...]}`` listing the cache keys
             (paths relative to FILE_ROOT) stored by this call, or a 400 JSON
             error when the requested folder resolves outside FILE_ROOT.
    """
    # Resolve both paths to absolute, normalised form so they are comparable
    # even when FILE_ROOT is configured as a relative path.
    file_root = os.path.abspath(app.config['FILE_ROOT'])
    base_dir = os.path.abspath(os.path.join(file_root, start_relative_path))

    # Compare whole path components rather than a string prefix; a plain
    # startswith() would accept a sibling such as "<root>_private".
    try:
        inside_root = os.path.commonpath([file_root, base_dir]) == file_root
    except ValueError:
        # Raised when the two paths cannot share a root (e.g. different drives).
        inside_root = False
    if not inside_root:
        return jsonify({"error": "Invalid path"}), 400

    cached_files = []  # Cache keys stored during this crawl

    for root, _dirs, files in os.walk(base_dir):
        for filename in files:
            full_path = os.path.join(root, filename)
            # The cache key is the file's path relative to FILE_ROOT
            rel_key = os.path.relpath(full_path, file_root)

            # Skip if this file is already in the cache
            if cache.get(rel_key):
                continue

            mime, _ = mimetypes.guess_type(full_path)
            mime = mime or 'application/octet-stream'

            if mime.startswith('image/'):
                # Best-effort: any decoding, encoding or caching failure is
                # logged and the crawl moves on to the next file.
                try:
                    with Image.open(full_path) as img:
                        # Create a thumbnail (max 1200x1200)
                        img.thumbnail((1200, 1200))
                        img_bytes_io = io.BytesIO()
                        # PNG is lossless, so no quality setting is passed.
                        img.save(img_bytes_io, format='PNG')
                    # The stored bytes are PNG regardless of the source
                    # format, so record the matching MIME type.
                    cache.set(rel_key, (img_bytes_io.getvalue(), 'image/png'))
                    cached_files.append(rel_key)
                except Exception as e:
                    app.logger.error("Image processing failed for %s: %s", rel_key, e)
            else:
                # Non-image files are cached verbatim
                try:
                    with open(full_path, 'rb') as f:
                        file_bytes = f.read()
                    cache.set(rel_key, (file_bytes, mime))
                    cached_files.append(rel_key)
                except Exception as e:
                    app.logger.error("Failed to read file %s: %s", rel_key, e)

    # jsonify sets the JSON Content-Type, matching the error response above
    return jsonify({"cached_files": cached_files}), 200
 # Catch-all route to serve the single-page application template.
@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')