Compare commits
No commits in common. "a7effaec8f4da599aa715dce16bc1673edb1bb83" and "ce0be76b70ea9c7085aca758805423672af06ea3" have entirely different histories.
a7effaec8f
...
ce0be76b70
@ -107,7 +107,44 @@ def print_speed(current_length):
|
|||||||
trans_speed = 0
|
trans_speed = 0
|
||||||
|
|
||||||
print(f" | Speed: {int(trans_speed)} minutes per hour | ", end='', flush=True)
|
print(f" | Speed: {int(trans_speed)} minutes per hour | ", end='', flush=True)
|
||||||
|
|
||||||
|
def write_markdown(file_path, result, postfix=None):
    """Write a transcription result as a Markdown file next to the audio file.

    The file is created in a "Transkription" sub-folder of the audio file's
    directory. Its name is the audio file's base name, optionally extended
    with ``_<postfix>``, plus the ``.md`` extension.

    Args:
        file_path: Path of the audio file that was transcribed.
        result: Mapping with a "segments" list. Each segment is read for
            its "start" and "text" keys; the last segment's "end" is
            passed to ``print_speed``.
        postfix: Optional suffix appended to the output base name
            (for example a language code). ``None`` means no suffix.
    """
    file_dir = os.path.dirname(file_path)
    txt_folder = os.path.join(file_dir, "Transkription")
    os.makedirs(txt_folder, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(file_path))[0]
    if postfix is not None:
        base_name = f"{base_name}_{postfix}"
    output_md = os.path.join(txt_folder, base_name + ".md")

    # Prepare the markdown content: folder and file name as headings,
    # followed by a horizontal rule and a blank line.
    folder_name = os.path.basename(file_dir)
    md_lines = [
        f"### {folder_name}",
        f"#### {os.path.basename(file_path)}",
        "---",
        "",
    ]

    segments = result["segments"]
    previous_text = ""
    for segment in segments:
        start = format_timestamp(segment["start"])
        text = segment["text"].strip()
        if previous_text != text:  # suppress directly repeated lines
            md_lines.append(f"`{start}` {text}")
        previous_text = text

    transcript_md = "\n".join(md_lines)
    transcript_md = apply_error_correction(transcript_md)
    transcript_md = remove_lines_with_words(transcript_md)

    with open(output_md, "w", encoding="utf-8") as f:
        f.write(transcript_md)

    # An empty transcription has no last segment; skip the speed report
    # instead of failing with an IndexError after the file was written.
    if segments:
        print_speed(segments[-1]["end"])
    print("... done !")
|
||||||
|
|
||||||
def transcribe_file(model, audio_input, language):
|
def transcribe_file(model, audio_input, language):
|
||||||
initial_prompt = (
|
initial_prompt = (
|
||||||
@ -133,85 +170,33 @@ def detect_language(model, audio):
|
|||||||
print(f"{lang_code}. ", end='', flush=True)
|
print(f"{lang_code}. ", end='', flush=True)
|
||||||
return lang_code
|
return lang_code
|
||||||
|
|
||||||
|
|
||||||
def process_file(file_path, model, audio_input):
|
def process_file(file_path, model, audio_input):
|
||||||
"""
|
|
||||||
Transcribe the audio file into one markdown file.
|
|
||||||
If special case (German sermon in Russian or Russian-marked file), transcribe both in Russian and German into the same file.
|
|
||||||
"""
|
|
||||||
file_name = os.path.basename(file_path)
|
file_name = os.path.basename(file_path)
|
||||||
|
|
||||||
# Detect spoken language
|
# default values
|
||||||
detected = detect_language(model, audio_input)
|
postfix = None
|
||||||
|
language = detect_language(model, audio_input)
|
||||||
# Determine which languages to transcribe
|
|
||||||
if (detected == 'ru' and 'predigt' in file_name.lower()) or \
|
if language == 'ru' and 'predigt' in file_name.lower() or language == 'de' and 'russisch' in file_name.lower(): # make two files
|
||||||
(detected == 'de' and 'russisch' in file_name.lower()):
|
# first file
|
||||||
langs = ['ru', 'de']
|
language="ru"
|
||||||
elif detected == 'en': # songs often mis-detected as English
|
postfix = "ru"
|
||||||
langs = ['de']
|
print(f"Transcribing {format_status_path(file_path)} ", end='', flush=True)
|
||||||
elif detected in ('de', 'ru'):
|
markdown = transcribe_file(model, audio_input, language)
|
||||||
langs = [detected]
|
write_markdown(file_path, markdown, postfix)
|
||||||
else:
|
# second file
|
||||||
langs = ['ru']
|
language="de"
|
||||||
|
postfix = "de"
|
||||||
# Collect segments for combined result
|
elif language == 'en': # songs mostly detect as english
|
||||||
combined_segments = []
|
language="de"
|
||||||
for lang in langs:
|
elif language == 'de' or language == 'ru': # keep as detected
|
||||||
print(f"Transcribing {format_status_path(file_path)} as {lang}", end='', flush=True)
|
pass
|
||||||
result = transcribe_file(model, audio_input, lang)
|
else: # neither German, English nor Russian --> fall back to Russian
|
||||||
|
language="ru"
|
||||||
# Insert a synthetic segment for language header
|
|
||||||
combined_segments.append({
|
print(f"Transcribing {format_status_path(file_path)} ", end='', flush=True)
|
||||||
'start': 0,
|
markdown = transcribe_file(model, audio_input, language)
|
||||||
'text': f"\n## Transcription ({lang.upper()})",
|
write_markdown(file_path, markdown, postfix)
|
||||||
})
|
|
||||||
|
|
||||||
# Extend with actual segments
|
|
||||||
if isinstance(result, dict) and 'segments' in result:
|
|
||||||
combined_segments.extend(result['segments'])
|
|
||||||
else:
|
|
||||||
# If result isn't dict-of-segments, wrap entire text
|
|
||||||
text = getattr(result, 'text', None) or (result.get('text') if isinstance(result, dict) else str(result))
|
|
||||||
combined_segments.append({'start': 0, 'text': text})
|
|
||||||
|
|
||||||
# Now write out markdown using the combined segments
|
|
||||||
file_dir = os.path.dirname(file_path)
|
|
||||||
txt_folder = os.path.join(file_dir, "Transkription")
|
|
||||||
os.makedirs(txt_folder, exist_ok=True)
|
|
||||||
base_name = os.path.splitext(os.path.basename(file_path))[0]
|
|
||||||
output_md = os.path.join(txt_folder, base_name + ".md")
|
|
||||||
|
|
||||||
# Build markdown lines
|
|
||||||
folder_name = os.path.basename(file_dir)
|
|
||||||
md_lines = [
|
|
||||||
f"### {folder_name}",
|
|
||||||
f"#### {os.path.basename(file_path)}",
|
|
||||||
"---",
|
|
||||||
""
|
|
||||||
]
|
|
||||||
previous_text = ""
|
|
||||||
for segment in combined_segments:
|
|
||||||
start = format_timestamp(segment.get('start', 0))
|
|
||||||
text = segment.get('text', '').strip()
|
|
||||||
if text and text != previous_text:
|
|
||||||
md_lines.append(f"`{start}` {text}")
|
|
||||||
previous_text = text
|
|
||||||
|
|
||||||
# Join and post-process
|
|
||||||
transcript_md = "\n".join(md_lines)
|
|
||||||
transcript_md = apply_error_correction(transcript_md)
|
|
||||||
transcript_md = remove_lines_with_words(transcript_md)
|
|
||||||
|
|
||||||
# Write file and report
|
|
||||||
with open(output_md, "w", encoding="utf-8") as f:
|
|
||||||
f.write(transcript_md)
|
|
||||||
|
|
||||||
if combined_segments:
|
|
||||||
end_ts = combined_segments[-1].get('end', combined_segments[-1].get('start', 0))
|
|
||||||
print_speed(end_ts)
|
|
||||||
print("... done !")
|
|
||||||
|
|
||||||
|
|
||||||
def process_folder(root_folder):
|
def process_folder(root_folder):
|
||||||
"""
|
"""
|
||||||
@ -240,8 +225,10 @@ def process_folder(root_folder):
|
|||||||
txt_folder = os.path.join(dirpath, "Transkription")
|
txt_folder = os.path.join(dirpath, "Transkription")
|
||||||
base_name = os.path.splitext(os.path.basename(file_path))[0]
|
base_name = os.path.splitext(os.path.basename(file_path))[0]
|
||||||
output_md = os.path.join(txt_folder, base_name + ".md")
|
output_md = os.path.join(txt_folder, base_name + ".md")
|
||||||
|
output_md_de = os.path.join(txt_folder, base_name + "_de.md")
|
||||||
|
output_md_ru = os.path.join(txt_folder, base_name + "_ru.md")
|
||||||
# skip files with existing md files
|
# skip files with existing md files
|
||||||
if os.path.exists(output_md):
|
if os.path.exists(output_md) or os.path.exists(output_md_de) or os.path.exists(output_md_ru):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
valid_files.append(file_path)
|
valid_files.append(file_path)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user