Bug fixes (#790)

MontrealCorpusTools · Apr 11, 2024 · 7611ddd · 7611ddd
1 parent c4ae60b
commit 7611ddd
Show file tree

Hide file tree

Showing 5 changed files with 20 additions and 9 deletions.
diff --git a/docs/source/changelog/changelog_3.0.rst b/docs/source/changelog/changelog_3.0.rst
@@ -5,6 +5,14 @@
 3.0 Changelog
 *************
 
+3.0.4
+=====
+
+- Fixed an issue where a GitHub token set in the environment was not respected
+- Changed ordering of G2P output from corpora to be based on word frequency (most frequent first) rather than alphabetical order
+- Changed duration deviation to save the maximum absolute z-scored phone duration in an utterance, rather than the average over all phones
+- Updated default punctuation markers to cover Arabic script punctuation
+
 3.0.3
 =====
 

diff --git a/montreal_forced_aligner/alignment/multiprocessing.py b/montreal_forced_aligner/alignment/multiprocessing.py
@@ -682,7 +682,7 @@ def _run(self):
                 for k, m, sd in session.query(
                     Phone.id, Phone.mean_duration, Phone.sd_duration
                 ).filter(
-                    Phone.phone_type.in_([PhoneType.non_silence, PhoneType.oov]),
+                    Phone.phone_type == PhoneType.non_silence,
                     Phone.sd_duration != None,  # noqa
                     Phone.sd_duration != 0,
                 )
@@ -705,13 +705,15 @@ def _run(self):
                     continue
                 interval_count = len(phone_intervals)
                 log_like_sum = 0
-                duration_zscore_sum = 0
+                duration_zscore_max = 0
                 for pi in phone_intervals:
                     log_like_sum += pi.phone_goodness
                     m, sd = phones[pi.phone_id]
-                    duration_zscore_sum += abs((pi.duration - m) / sd)
+                    duration_zscore = abs((pi.duration - m) / sd)
+                    if duration_zscore > duration_zscore_max:
+                        duration_zscore_max = duration_zscore
                 utterance_speech_log_likelihood = log_like_sum / interval_count
-                utterance_duration_deviation = duration_zscore_sum / interval_count
+                utterance_duration_deviation = duration_zscore_max
                 self.callback(
                     (utterance.id, utterance_speech_log_likelihood, utterance_duration_deviation)
                 )

diff --git a/montreal_forced_aligner/db.py b/montreal_forced_aligner/db.py
@@ -1064,6 +1064,7 @@ class File(MfaSqlBase):
         order_by="Utterance.begin",
         collection_class=ordering_list("begin"),
         cascade="all, delete",
+        cascade_backrefs=False,
     )
 
     @property
@@ -1424,8 +1425,8 @@ class Utterance(MfaSqlBase):
         index=True,
     )
     job_id = Column(Integer, ForeignKey("job.id"), index=True, nullable=True)
-    file = relationship("File", back_populates="utterances")
-    speaker = relationship("Speaker", back_populates="utterances")
+    file = relationship("File", back_populates="utterances", cascade_backrefs=False)
+    speaker = relationship("Speaker", back_populates="utterances", cascade_backrefs=False)
     job = relationship("Job", back_populates="utterances")
     phone_intervals = relationship(
         "PhoneInterval",

diff --git a/montreal_forced_aligner/dictionary/mixins.py b/montreal_forced_aligner/dictionary/mixins.py
@@ -18,9 +18,9 @@
 if TYPE_CHECKING:
     from montreal_forced_aligner.abc import MetaDict
 
-DEFAULT_PUNCTUATION = list(r'、。।，？！!@<>→"”()“„–,.:;—¿?¡：）!\\&%#*~【】，…‥「」『』〝〟″⟨⟩♪・‚‘‹›«»～′$+=‘۔')
+DEFAULT_PUNCTUATION = list(r'、。।，？！!@<>→"”()“„–,.:;—¿?¡：）؟!\\&%#*،~【】，…‥「」『』〝〟″⟨⟩♪・‚‘‹›«»～′$+=‘۔')
 
-DEFAULT_WORD_BREAK_MARKERS = list(r'？！!()，,.:;¡¿?“„"”&~%#—…‥、。【】$+=〝〟″‹›«»・⟨⟩「」『』')
+DEFAULT_WORD_BREAK_MARKERS = list(r'？！!()，,.:;¡¿?“„"”&~%#—…‥、。【】$+=〝〟″‹›«»・⟨⟩،「」『』؟')
 
 DEFAULT_QUOTE_MARKERS = list("“„\"”〝〟″「」『』‚ʻʿ‘′'")
 

diff --git a/montreal_forced_aligner/g2p/generator.py b/montreal_forced_aligner/g2p/generator.py
@@ -1053,7 +1053,7 @@ def words_to_g2p(self) -> List[str]:
                 query = (
                     session.query(Word.word)
                     .filter(Word.word_type == WordType.oov, Word.word != self.oov_word)
-                    .order_by(Word.word)
+                    .order_by(Word.count.desc())
                 )
             self._word_list = [x for x, in query]
         return self._word_list