Transcribe multilingual audio into one file

2025-06-15 22:09:38 +02:00 · 2025-06-15 22:09:38 +02:00 · e3408bf389
commit e3408bf389
parent 5f57de04c8
1 changed file with 79 additions and 66 deletions
--- a/transcribe_all.py
+++ b/transcribe_all.py
@@ -107,44 +107,7 @@ def print_speed(current_length):
        trans_speed = 0
    print(f" | Speed: {int(trans_speed)} minutes per hour | ", end='', flush=True)
-    
+
 def write_markdown(file_path, result, postfix=None):
    file_dir = os.path.dirname(file_path)
    txt_folder = os.path.join(file_dir, "Transkription")
    os.makedirs(txt_folder, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    if postfix != None:
        base_name = f"{base_name}_{postfix}"
    output_md = os.path.join(txt_folder, base_name + ".md")
    # Prepare the markdown content.
    folder_name = os.path.basename(file_dir)
    md_lines = [
        f"### {folder_name}",
        f"#### {os.path.basename(file_path)}",
        "---",
        ""
    ]
    previous_text = ""
    for segment in result["segments"]:
        start = format_timestamp(segment["start"])
        text = segment["text"].strip()
        if previous_text != text: # suppress repeating lines
            md_lines.append(f"`{start}` {text}")
        previous_text = text
    transcript_md = "\n".join(md_lines)
    transcript_md = apply_error_correction(transcript_md)
    transcript_md = remove_lines_with_words(transcript_md)
    with open(output_md, "w", encoding="utf-8") as f:
        f.write(transcript_md)
    print_speed(result["segments"][-1]["end"])
    print(f"... done !")
 def transcribe_file(model, audio_input, language):
    initial_prompt = (
@@ -170,33 +133,85 @@ def detect_language(model, audio):
    print(f"{lang_code}. ", end='', flush=True)
    return lang_code
 def process_file(file_path, model, audio_input):
    """
    Transcribe the audio file into one markdown file.
    If special case (German sermon in Russian or Russian-marked file), transcribe both in Russian and German into the same file.
    """
    file_name = os.path.basename(file_path)
-    
+
-    # default values
+    # Detect spoken language
-    postfix = None
+    detected = detect_language(model, audio_input)
-    language = detect_language(model, audio_input)
+
-        
+    # Determine which languages to transcribe
-    if language == 'ru' and 'predigt' in file_name.lower() or language == 'de' and 'russisch' in file_name.lower(): # make two files
+    if (detected == 'ru' and 'predigt' in file_name.lower()) or \
-        # first file
+       (detected == 'de' and 'russisch' in file_name.lower()):
-        language="ru"
+        langs = ['ru', 'de']
-        postfix = "ru"
+    elif detected == 'en':  # songs often mis-detected as English
-        print(f"Transcribing {format_status_path(file_path)} ", end='', flush=True)
+        langs = ['de']
-        markdown = transcribe_file(model, audio_input, language)
+    elif detected in ('de', 'ru'):
-        write_markdown(file_path, markdown, postfix)
+        langs = [detected]
-        # second file
+    else:
-        language="de"
+        langs = ['ru']
-        postfix = "de"
+
-    elif language == 'en': # songs mostly detect as english
+    # Collect segments for combined result
-        language="de"
+    combined_segments = []
-    elif language == 'de' or language == 'ru': # keep as detected
+    for lang in langs:
-        pass
+        print(f"Transcribing {format_status_path(file_path)} as {lang}", end='', flush=True)
-    else: # not german not english and not russian. --> russina
+        result = transcribe_file(model, audio_input, lang)
-        language="ru"
+
-            
+        # Insert a synthetic segment for language header
-    print(f"Transcribing {format_status_path(file_path)} ", end='', flush=True)       
+        combined_segments.append({
-    markdown = transcribe_file(model, audio_input, language)
+            'start': 0,
-    write_markdown(file_path, markdown, postfix)
+            'text': f"\n## Transcription ({lang.upper()})",
        })
        # Extend with actual segments
        if isinstance(result, dict) and 'segments' in result:
            combined_segments.extend(result['segments'])
        else:
            # If result isn't dict-of-segments, wrap entire text
            text = getattr(result, 'text', None) or (result.get('text') if isinstance(result, dict) else str(result))
            combined_segments.append({'start': 0, 'text': text})
    # Now write out markdown using the combined segments
    file_dir = os.path.dirname(file_path)
    txt_folder = os.path.join(file_dir, "Transkription")
    os.makedirs(txt_folder, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_md = os.path.join(txt_folder, base_name + ".md")
    # Build markdown lines
    folder_name = os.path.basename(file_dir)
    md_lines = [
        f"### {folder_name}",
        f"#### {os.path.basename(file_path)}",
        "---",
        ""
    ]
    previous_text = ""
    for segment in combined_segments:
        start = format_timestamp(segment.get('start', 0))
        text = segment.get('text', '').strip()
        if text and text != previous_text:
            md_lines.append(f"`{start}` {text}")
            previous_text = text
    # Join and post-process
    transcript_md = "\n".join(md_lines)
    transcript_md = apply_error_correction(transcript_md)
    transcript_md = remove_lines_with_words(transcript_md)
    # Write file and report
    with open(output_md, "w", encoding="utf-8") as f:
        f.write(transcript_md)
    if combined_segments:
        end_ts = combined_segments[-1].get('end', combined_segments[-1].get('start', 0))
        print_speed(end_ts)
    print("... done !")
 def process_folder(root_folder):
    """
@@ -225,10 +240,8 @@ def process_folder(root_folder):
                txt_folder = os.path.join(dirpath, "Transkription")
                base_name = os.path.splitext(os.path.basename(file_path))[0]
                output_md = os.path.join(txt_folder, base_name + ".md")
                output_md_de = os.path.join(txt_folder, base_name + "_de.md")
                output_md_ru = os.path.join(txt_folder, base_name + "_ru.md")
                # skip files with existing md files
-                if os.path.exists(output_md) or os.path.exists(output_md_de) or os.path.exists(output_md_ru):
+                if os.path.exists(output_md):
                    continue
                valid_files.append(file_path)