Merge branch 'development' of gitea.centx.de:lelo/bethaus-app into development

transcribe multilingual inside one file
2025-06-15 22:09:47 +02:00 · 2025-06-15 22:09:38 +02:00
1 changed file with 79 additions and 66 deletions
--- a/transcribe_all.py
+++ b/transcribe_all.py
@ -107,44 +107,7 @@ def print_speed(current_length):
        trans_speed = 0
        
    print(f" | Speed: {int(trans_speed)} minutes per hour | ", end='', flush=True)
-    
-def write_markdown(file_path, result, postfix=None):
-    file_dir = os.path.dirname(file_path)
-    txt_folder = os.path.join(file_dir, "Transkription")
-    os.makedirs(txt_folder, exist_ok=True)
-    base_name = os.path.splitext(os.path.basename(file_path))[0]
-    if postfix != None:
-        base_name = f"{base_name}_{postfix}"
-    output_md = os.path.join(txt_folder, base_name + ".md")
-    
-    # Prepare the markdown content.
-    folder_name = os.path.basename(file_dir)
-    md_lines = [
-        f"### {folder_name}",
-        f"#### {os.path.basename(file_path)}",
-        "---",
-        ""
-    ]
-    
-    previous_text = ""
-    for segment in result["segments"]:
-        start = format_timestamp(segment["start"])
-        text = segment["text"].strip()
-        if previous_text != text: # suppress repeating lines
-            md_lines.append(f"`{start}` {text}")
-        previous_text = text
-    
-    transcript_md = "\n".join(md_lines)
-    
-    transcript_md = apply_error_correction(transcript_md)
-    
-    transcript_md = remove_lines_with_words(transcript_md)
-    
-    with open(output_md, "w", encoding="utf-8") as f:
-        f.write(transcript_md)
-    
-    print_speed(result["segments"][-1]["end"])
-    print(f"... done !")
+

 def transcribe_file(model, audio_input, language):
    initial_prompt = (
@ -170,33 +133,85 @@ def detect_language(model, audio):
    print(f"{lang_code}. ", end='', flush=True)
    return lang_code

+
 def process_file(file_path, model, audio_input):
+    """
+    Transcribe the audio file into one markdown file.
+    If special case (German sermon in Russian or Russian-marked file), transcribe both in Russian and German into the same file.
+    """
    file_name = os.path.basename(file_path)
-    
-    # default values
-    postfix = None
-    language = detect_language(model, audio_input)
-        
-    if language == 'ru' and 'predigt' in file_name.lower() or language == 'de' and 'russisch' in file_name.lower(): # make two files
-        # first file
-        language="ru"
-        postfix = "ru"
-        print(f"Transcribing {format_status_path(file_path)} ", end='', flush=True)
-        markdown = transcribe_file(model, audio_input, language)
-        write_markdown(file_path, markdown, postfix)
-        # second file
-        language="de"
-        postfix = "de"
-    elif language == 'en': # songs mostly detect as english
-        language="de"
-    elif language == 'de' or language == 'ru': # keep as detected
-        pass
-    else: # not german not english and not russian. --> russina
-        language="ru"
-            
-    print(f"Transcribing {format_status_path(file_path)} ", end='', flush=True)       
-    markdown = transcribe_file(model, audio_input, language)
-    write_markdown(file_path, markdown, postfix)
+
+    # Detect spoken language
+    detected = detect_language(model, audio_input)
+
+    # Determine which languages to transcribe
+    if (detected == 'ru' and 'predigt' in file_name.lower()) or \
+       (detected == 'de' and 'russisch' in file_name.lower()):
+        langs = ['ru', 'de']
+    elif detected == 'en':  # songs often mis-detected as English
+        langs = ['de']
+    elif detected in ('de', 'ru'):
+        langs = [detected]
+    else:
+        langs = ['ru']
+
+    # Collect segments for combined result
+    combined_segments = []
+    for lang in langs:
+        print(f"Transcribing {format_status_path(file_path)} as {lang}", end='', flush=True)
+        result = transcribe_file(model, audio_input, lang)
+
+        # Insert a synthetic segment for language header
+        combined_segments.append({
+            'start': 0,
+            'text': f"\n## Transcription ({lang.upper()})",
+        })
+
+        # Extend with actual segments
+        if isinstance(result, dict) and 'segments' in result:
+            combined_segments.extend(result['segments'])
+        else:
+            # If result isn't dict-of-segments, wrap entire text
+            text = getattr(result, 'text', None) or (result.get('text') if isinstance(result, dict) else str(result))
+            combined_segments.append({'start': 0, 'text': text})
+
+    # Now write out markdown using the combined segments
+    file_dir = os.path.dirname(file_path)
+    txt_folder = os.path.join(file_dir, "Transkription")
+    os.makedirs(txt_folder, exist_ok=True)
+    base_name = os.path.splitext(os.path.basename(file_path))[0]
+    output_md = os.path.join(txt_folder, base_name + ".md")
+
+    # Build markdown lines
+    folder_name = os.path.basename(file_dir)
+    md_lines = [
+        f"### {folder_name}",
+        f"#### {os.path.basename(file_path)}",
+        "---",
+        ""
+    ]
+    previous_text = ""
+    for segment in combined_segments:
+        start = format_timestamp(segment.get('start', 0))
+        text = segment.get('text', '').strip()
+        if text and text != previous_text:
+            md_lines.append(f"`{start}` {text}")
+            previous_text = text
+
+    # Join and post-process
+    transcript_md = "\n".join(md_lines)
+    transcript_md = apply_error_correction(transcript_md)
+    transcript_md = remove_lines_with_words(transcript_md)
+
+    # Write file and report
+    with open(output_md, "w", encoding="utf-8") as f:
+        f.write(transcript_md)
+
+    if combined_segments:
+        end_ts = combined_segments[-1].get('end', combined_segments[-1].get('start', 0))
+        print_speed(end_ts)
+    print("... done !")
+

 def process_folder(root_folder):
    """
@ -225,10 +240,8 @@ def process_folder(root_folder):
                txt_folder = os.path.join(dirpath, "Transkription")
                base_name = os.path.splitext(os.path.basename(file_path))[0]
                output_md = os.path.join(txt_folder, base_name + ".md")
-                output_md_de = os.path.join(txt_folder, base_name + "_de.md")
-                output_md_ru = os.path.join(txt_folder, base_name + "_ru.md")
                # skip files with existing md files
-                if os.path.exists(output_md) or os.path.exists(output_md_de) or os.path.exists(output_md_ru):
+                if os.path.exists(output_md):
                    continue
                
                valid_files.append(file_path)
Author	SHA1	Message	Date
lelo	a7effaec8f	Merge branch 'development' of gitea.centx.de:lelo/bethaus-app into development	2025-06-15 22:09:47 +02:00
lelo	e3408bf389	transcribe multilingual inside one file	2025-06-15 22:09:38 +02:00