add crawler
This commit is contained in:
parent
3fa915a9ce
commit
d233e66268
62
app.py
62
app.py
@ -424,6 +424,68 @@ def get_transcript(filename):
|
||||
content = f.read()
|
||||
return content, 200, {'Content-Type': 'text/markdown; charset=utf-8'}
|
||||
|
||||
@app.route("/crawl/<path:start_relative_path>")
@require_secret
def crawl_and_cache(start_relative_path):
    """
    Crawl a directory under app.config['FILE_ROOT'] and cache every file in it.

    Images are shrunk to fit within 1200x1200, re-encoded as PNG and the
    resulting bytes are cached. Any other file is cached as its raw bytes.
    Each cache entry is stored under the file's path relative to FILE_ROOT as
    a ``(bytes, mime)`` tuple. Files whose key already holds a truthy cache
    entry are skipped, and files that fail to load are logged and skipped so
    that one bad file does not abort the whole crawl.

    :param start_relative_path: The folder (relative to FILE_ROOT) to start crawling.
    :return: A JSON body ``{"cached_files": [...]}`` listing the keys cached by
        this call with status 200, or ``{"error": "Invalid path"}`` with
        status 400 when the requested folder lies outside FILE_ROOT.
    """
    # Resolve both paths to absolute form so the containment check compares
    # like with like even when FILE_ROOT is configured as a relative path.
    file_root = os.path.abspath(app.config['FILE_ROOT'])
    base_dir = os.path.abspath(os.path.join(file_root, start_relative_path))

    # Compare whole path components rather than string prefixes: a plain
    # startswith() would accept a sibling such as "<FILE_ROOT>_private".
    try:
        inside_root = os.path.commonpath([file_root, base_dir]) == file_root
    except ValueError:
        # Raised when the paths cannot share a prefix (e.g. different drives).
        inside_root = False
    if not inside_root:
        return jsonify({"error": "Invalid path"}), 400

    cached_files = []  # Relative keys of the files cached by this call

    # Walk through all subdirectories and files
    for root, _dirs, files in os.walk(base_dir):
        for filename in files:
            full_path = os.path.join(root, filename)
            # The cache key is the path relative to FILE_ROOT
            rel_key = os.path.relpath(full_path, file_root)

            # Skip if this file is already in the cache
            if cache.get(rel_key):
                continue

            # Determine the MIME type, falling back to a generic binary type
            mime, _ = mimetypes.guess_type(full_path)
            mime = mime or 'application/octet-stream'

            if mime.startswith('image/'):
                # Deliberately broad: any decode or encode failure should be
                # logged and skipped so the crawl continues.
                try:
                    with Image.open(full_path) as img:
                        # Shrink in place to fit within 1200x1200
                        img.thumbnail((1200, 1200))
                        img_bytes_io = io.BytesIO()
                        # PNG is lossless, so no JPEG-style quality setting applies
                        img.save(img_bytes_io, format='PNG')
                        img_bytes = img_bytes_io.getvalue()
                except Exception as e:
                    app.logger.error("Image processing failed for %s: %s", rel_key, e)
                    continue
                # NOTE(review): the bytes are PNG but the stored mime is the
                # source file's guessed type; confirm what cache readers expect.
                cache.set(rel_key, (img_bytes, mime))
                cached_files.append(rel_key)
            else:
                # Non-image files are cached verbatim
                try:
                    with open(full_path, 'rb') as f:
                        file_bytes = f.read()
                except Exception as e:
                    app.logger.error("Failed to read file %s: %s", rel_key, e)
                    continue
                cache.set(rel_key, (file_bytes, mime))
                cached_files.append(rel_key)

    # Return the list of cached files, declared as JSON so clients parse it
    # the same way as the jsonify() error response above.
    body = json.dumps({"cached_files": cached_files}, indent=4)
    return body, 200, {'Content-Type': 'application/json'}
|
||||
|
||||
# Catch-all route to serve the single-page application template.
|
||||
@app.route('/', defaults={'path': ''})
|
||||
@app.route('/<path:path>')
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user