add crawler
This commit is contained in:
parent
3fa915a9ce
commit
d233e66268
62
app.py
62
app.py
@ -424,6 +424,68 @@ def get_transcript(filename):
|
|||||||
content = f.read()
|
content = f.read()
|
||||||
return content, 200, {'Content-Type': 'text/markdown; charset=utf-8'}
|
return content, 200, {'Content-Type': 'text/markdown; charset=utf-8'}
|
||||||
|
|
||||||
|
@app.route("/crawl/<path:start_relative_path>")
@require_secret
def crawl_and_cache(start_relative_path):
    """
    Crawl a directory under app.config['FILE_ROOT'] and cache every file in it.

    Images are downscaled to fit within 1200x1200, re-encoded as PNG and cached
    as ``(png_bytes, 'image/png')``. Every other file is cached verbatim as
    ``(file_bytes, guessed_mime)``. Files whose key is already present in the
    cache are skipped. A file that cannot be processed is logged and skipped so
    one bad file does not abort the whole crawl.

    :param start_relative_path: The folder (relative to FILE_ROOT) to start crawling.
    :return: A JSON body ``{"cached_files": [...]}`` listing the keys cached by
        this call (paths relative to FILE_ROOT) with status 200, or a JSON
        error with status 400 when the requested path escapes FILE_ROOT.
    """
    # Resolve both paths to absolute, normalised form so they are comparable
    # even when FILE_ROOT is configured as a relative path.
    file_root = os.path.abspath(app.config['FILE_ROOT'])
    base_dir = os.path.abspath(os.path.join(file_root, start_relative_path))

    # Directory-traversal guard. A bare startswith(file_root) would also accept
    # sibling directories that merely share the prefix (e.g. "/srv/files_private"
    # for a root of "/srv/files"), so compare against the root with a trailing
    # separator. os.path.join(root, '') also handles a root of "/" correctly.
    root_prefix = os.path.join(file_root, '')
    if base_dir != file_root and not base_dir.startswith(root_prefix):
        return jsonify({"error": "Invalid path"}), 400

    cached_files = []  # Keys (paths relative to FILE_ROOT) cached by this call

    # Walk through all subdirectories and files
    for root, _dirs, files in os.walk(base_dir):
        for filename in files:
            full_path = os.path.join(root, filename)
            # The cache key is the file's path relative to FILE_ROOT
            rel_key = os.path.relpath(full_path, file_root)

            # Skip if this file is already in the cache
            if cache.get(rel_key) is not None:
                continue

            # Determine the MIME type, falling back to a generic binary type
            mime, _ = mimetypes.guess_type(full_path)
            mime = mime or 'application/octet-stream'

            if mime.startswith('image/'):
                # Best-effort: any failure (unreadable or unsupported image)
                # is logged and the crawl moves on to the next file.
                try:
                    with Image.open(full_path) as img:
                        # Shrink in place to fit 1200x1200, keeping aspect ratio
                        img.thumbnail((1200, 1200))
                        img_bytes_io = io.BytesIO()
                        # PNG is lossless, so no JPEG-style quality setting applies
                        img.save(img_bytes_io, format='PNG')
                        img_bytes = img_bytes_io.getvalue()
                    # The stored bytes are PNG regardless of the source format,
                    # so record the MIME type that matches them.
                    cache.set(rel_key, (img_bytes, 'image/png'))
                    cached_files.append(rel_key)
                except Exception as e:
                    app.logger.error(f"Image processing failed for {rel_key}: {e}")
            else:
                # Non-image files are cached byte-for-byte
                try:
                    with open(full_path, 'rb') as f:
                        file_bytes = f.read()
                    cache.set(rel_key, (file_bytes, mime))
                    cached_files.append(rel_key)
                except Exception as e:
                    app.logger.error(f"Failed to read file {rel_key}: {e}")

    # Return the list of cached files; declare the body as JSON explicitly,
    # otherwise Flask serves a plain string as text/html.
    body = json.dumps({"cached_files": cached_files}, indent=4)
    return body, 200, {'Content-Type': 'application/json'}
|
||||||
|
|
||||||
# Catch-all route to serve the single-page application template.
|
# Catch-all route to serve the single-page application template.
|
||||||
@app.route('/', defaults={'path': ''})
|
@app.route('/', defaults={'path': ''})
|
||||||
@app.route('/<path:path>')
|
@app.route('/<path:path>')
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user