from transformers import (
AutomaticSpeechRecognitionPipeline,
AutoFeatureExtractor,
Wav2Vec2ForCTC,
AutoTokenizer
)
# Checkpoint identifier handed to every ``from_pretrained`` call below, so the
# feature extractor, model and tokenizer are all loaded from the same source.
pretrained_model_name_or_path = "imvladikon/wav2vec2-xls-r-300m-hebrew"

# Load the three components one at a time, in the same order in which they
# are passed to the pipeline constructor.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    pretrained_model_name_or_path
)
model = Wav2Vec2ForCTC.from_pretrained(pretrained_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)

# Assemble the speech-recognition pipeline from the pre-loaded components.
asr = AutomaticSpeechRecognitionPipeline(
    feature_extractor=feature_extractor,
    model=model,
    tokenizer=tokenizer,
)

# Run the pipeline on an audio file and print whatever it returns.
filename = "audio.wav"
print(asr(filename))
Note: splitting long audio files into smaller chunks is not implemented yet.
pip install git+https://github.com/imvladikon/wav2vec2-hebrew
from wav2vec2_hebrew import HebrewSpeechRecognitionPipeline
# Path of the sample recording to transcribe.
filename = "./samples/bereshit011.wav"

# Build the pipeline with its default arguments, run it on the sample file
# and print the result.
asr = HebrewSpeechRecognitionPipeline()
output = asr(filename)
print(output)
# Example output:
# [{'text': 'בראשית ברא אלוהים את השמייים ואת הארץ'}]
import torchaudio
from wav2vec2_hebrew import HebrewWav2Vec2Aligner
# Sample recording and the reference text to align against it.
filename = "./samples/bereshit011.wav"
text = "בראשית ברא אלוהים את השמיים ואת הארץ"

# Aligner configured for 16 kHz input and CUDA execution.
aligner = HebrewWav2Vec2Aligner(input_sample_rate=16000, use_cuda=True)

# Align the recording to the text; the result is indexed per sentence, and
# only the first entry is kept here.
alignments = aligner.align_data(filename, text)
first_sentence = alignments[0]
# Example value of ``first_sentence``:
# {'sentence': 'בראשית ברא אלוהים את השמיים ואת הארץ',
# 'segments': [Segment(label='בראשית', start=6750.516853932584, end=18644.284644194755, score=0.16025335497152965)...]}

# Load the waveform and display the aligned segments. This is meant for an
# IPython notebook; the segments are shown using IPython.display.Audio.
waveform, sample_rate = torchaudio.load(filename)
aligner.show_segments(waveform, first_sentence)
Training logs and details are available in the train folder.