-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransliterate.py
38 lines (28 loc) · 891 Bytes
/
transliterate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from ai4bharat.transliteration import XlitEngine
from indicnlp.tokenize import indic_tokenize
import pandas as pd
# Module-level transliteration engine, built once when this module is imported
# and used by devanagarized() below. "brx" is the language code requested
# (presumably Bodo -- TODO confirm); beam_width=5 and rescore=False are passed
# straight through to XlitEngine.
b = XlitEngine("brx", beam_width=5, rescore=False)
# Roman Script to Devanagari
def transliterate(lyrics_list):
    """Transliterate Roman-script lyric lines to Devanagari.

    Every line is first normalised by ``sentence_english_tokenize`` and the
    result is then handed to ``devanagarized``.

    Args:
        lyrics_list: Iterable of lyric lines (strings).

    Returns:
        A new list holding one transliterated string per input line, in the
        same order. An empty input yields an empty list.
    """
    # This is a plain per-item map, so a comprehension is enough; the earlier
    # round-trip through a pandas Series added overhead without changing the
    # result for list input.
    return [
        devanagarized(sentence_english_tokenize(line))
        for line in lyrics_list
    ]
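# TODO: bad naming -- `devanagarized` reads like an adjective; consider a verb-style name (e.g. `to_devanagari`) once callers can be updated.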
def devanagarized(x):
    """Transliterate each whitespace-separated token of ``x``.

    Every token is sent to the module-level engine ``b`` through
    ``translit_word`` (requesting 5 candidates); only the first candidate
    listed under the ``'brx'`` key is kept.

    Args:
        x: Input string; it is split on whitespace with ``str.split()``.

    Returns:
        The kept candidates joined by single spaces. An input with no tokens
        (empty or whitespace-only) yields ``''``.
    """
    top_candidates = (
        b.translit_word(token, topk=5)['brx'][0]
        for token in x.split()
    )
    return ' '.join(top_candidates)
def sentence_english_tokenize(x):
    """Tokenize ``x`` and re-join the tokens with single spaces.

    Tokenization is delegated to ``indic_tokenize.trivial_tokenize``; the
    tokens it produces are concatenated in order, separated by one space.

    Args:
        x: The sentence to tokenize.

    Returns:
        A single string of the tokens separated by spaces.
    """
    tokens = indic_tokenize.trivial_tokenize(x)
    return ' '.join(tokens)