segment between languages
This commit is contained in:
parent
a7effaec8f
commit
fde07e3830
@ -147,7 +147,7 @@ def process_file(file_path, model, audio_input):
|
||||
# Determine which languages to transcribe
|
||||
if (detected == 'ru' and 'predigt' in file_name.lower()) or \
|
||||
(detected == 'de' and 'russisch' in file_name.lower()):
|
||||
langs = ['ru', 'de']
|
||||
langs = ['de', 'ru']
|
||||
elif detected == 'en': # songs often mis-detected as English
|
||||
langs = ['de']
|
||||
elif detected in ('de', 'ru'):
|
||||
@ -156,17 +156,12 @@ def process_file(file_path, model, audio_input):
|
||||
langs = ['ru']
|
||||
|
||||
# Collect segments for combined result
|
||||
combined_segments = []
|
||||
lang_collection = {}
|
||||
for lang in langs:
|
||||
combined_segments = []
|
||||
print(f"Transcribing {format_status_path(file_path)} as {lang}", end='', flush=True)
|
||||
result = transcribe_file(model, audio_input, lang)
|
||||
|
||||
# Insert a synthetic segment for language header
|
||||
combined_segments.append({
|
||||
'start': 0,
|
||||
'text': f"\n## Transcription ({lang.upper()})",
|
||||
})
|
||||
|
||||
# Extend with actual segments
|
||||
if isinstance(result, dict) and 'segments' in result:
|
||||
combined_segments.extend(result['segments'])
|
||||
@ -174,6 +169,7 @@ def process_file(file_path, model, audio_input):
|
||||
# If result isn't dict-of-segments, wrap entire text
|
||||
text = getattr(result, 'text', None) or (result.get('text') if isinstance(result, dict) else str(result))
|
||||
combined_segments.append({'start': 0, 'text': text})
|
||||
lang_collection[lang] = combined_segments
|
||||
|
||||
# Now write out markdown using the combined segments
|
||||
file_dir = os.path.dirname(file_path)
|
||||
@ -191,12 +187,15 @@ def process_file(file_path, model, audio_input):
|
||||
""
|
||||
]
|
||||
previous_text = ""
|
||||
for segment in combined_segments:
|
||||
start = format_timestamp(segment.get('start', 0))
|
||||
text = segment.get('text', '').strip()
|
||||
if text and text != previous_text:
|
||||
md_lines.append(f"`{start}` {text}")
|
||||
previous_text = text
|
||||
for lang, combined_segments in lang_collection.items():
|
||||
md_lines.append(f"##### Transcription ({lang.upper()})")
|
||||
md_lines.append("---")
|
||||
for segment in combined_segments:
|
||||
start = format_timestamp(segment.get('start', 0))
|
||||
text = segment.get('text', '').strip()
|
||||
if text and text != previous_text:
|
||||
md_lines.append(f"`{start}` {text}")
|
||||
previous_text = text
|
||||
|
||||
# Join and post-process
|
||||
transcript_md = "\n".join(md_lines)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user