From a67176e137db51df10d99749b69084f8d60bf865 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B0=B7=E6=B2=A2=E9=9D=96=E6=B4=8B?= <tanizawa@yt.local>
Date: Mon, 11 Jul 2022 18:30:54 +0900
Subject: [PATCH] fix_partial_CDS function impremented

fix pseudo-partial CDS predicted at the end of the contig, but is likely to be intact.
The one that meet the following conditions will be retained
    - Starts at the coordinate 1 (+ strand) or at the end of contig (- strand)
    - 5'-end is missing and 3'-end is intact (+ strand) or the opposited case (- strand)
    - length is multiple of 3 and > min_length
    - the CDS has significant protein hit.

partial_flag 10, strand +, len>=500
partial_flag 01, strand -, len>=500

if the first 3 nuleotides are in [ATG, GTG, TTG], the CDS is fixed as intact. Otherwise, it is annotated as misc_feature
---
 dfc/__init__.py                  |  2 +-
 dfc/models/hit.py                | 15 +++---
 dfc/pipeline.py                  |  1 +
 dfc/utils/feature_util.py        | 89 +++++++++++++++++++++++++++++---
 dfc/utils/locus_tag_generator.py |  2 +
 dfc_docker/Dockerfile            |  2 +-
 6 files changed, 95 insertions(+), 16 deletions(-)

diff --git a/dfc/__init__.py b/dfc/__init__.py
index de21e04..6afad73 100644
--- a/dfc/__init__.py
+++ b/dfc/__init__.py
@@ -1 +1 @@
-dfast_version = "1.2.17"
+dfast_version = "1.2.18"
diff --git a/dfc/models/hit.py b/dfc/models/hit.py
index 4c2e204..b4277d4 100644
--- a/dfc/models/hit.py
+++ b/dfc/models/hit.py
@@ -50,13 +50,16 @@ def __str__(self):
         return ret
 
     def assign(self, feature, verbosity=2):
-        feature.qualifiers["product"] = [self.description]
+        if feature.type == "CDS":
+            # ignored for misc_feature
+            feature.qualifiers["product"] = [self.description]
+            if self.gene:
+                feature.qualifiers["gene"] = [self.gene]
+            if self.ec_number:
+                feature.qualifiers["EC_number"] = [x.strip() for x in self.ec_number.split(",")]
+            feature.qualifiers.setdefault("inference", []).append(self.get_inference())
+
         # print("DEBUG setting product", self.description)
-        if self.gene:
-            feature.qualifiers["gene"] = [self.gene]
-        if self.ec_number:
-            feature.qualifiers["EC_number"] = [x.strip() for x in self.ec_number.split(",")]
-        feature.qualifiers.setdefault("inference", []).append(self.get_inference())
         # todo modify note qualifier into appropriate one
         feature.qualifiers.setdefault("note", []).extend(self.notes)
         if verbosity >= 2:
diff --git a/dfc/pipeline.py b/dfc/pipeline.py
index bfb6dfd..3b5aacb 100644
--- a/dfc/pipeline.py
+++ b/dfc/pipeline.py
@@ -52,6 +52,7 @@ def execute(self):
         self.sa.execute()  # execute structural annotation
         self.fu.execute()  # feature adjustment: sort, remove_partial, (merge)
         self.fa.execute()  # functional annotation
+        self.fu.execute_remove_partial()  # feature adjustment remove partial
         self.ltg.execute()  # assigning locus_tags
         self.genome.add_source_features()  # set source feature
 
diff --git a/dfc/utils/feature_util.py b/dfc/utils/feature_util.py
index a6d1c30..f530ce6 100644
--- a/dfc/utils/feature_util.py
+++ b/dfc/utils/feature_util.py
@@ -59,7 +59,7 @@ def _resolve_overlap(seq_record, threshold=10):
             seq_record.features = [feature for feature in seq_record.features if feature.id not in removed]
             for feature_id in removed:
                 feature = self.genome.features[feature_id]
-                self.logger.debug("Removing overlapping feature: {} in {}".format(feature.id, feature.seq_id))
+                self.logger.debug("Removing overlapping feature: {} at {}:{}".format(feature.id, feature.seq_id, str(feature.location)))
             return len(removed)
 
         count_removed = 0
@@ -69,19 +69,26 @@ def _resolve_overlap(seq_record, threshold=10):
         self.logger.info("Removed {} overlapping features.".format(count_removed))
 
     def remove_partial_features(self):
-        exempted_features = ["CRISPR"]
+        exempted_features = ["CRISPR", "misc_feature"]
         removed = []
 
         for feature in self.genome.features.values():
-            # if hasattr(feature, "location"):
-            #     print(feature, feature.id)
-            #     continue
+            seq = self.genome.seq_records[feature.seq_id].seq
+            if feature.location.start < 10 or feature.location.end > len(seq) - 10:
+                # print(type(seq))
+                # print(feature)
+                # print(feature.annotations)
+                # print(self.genome.seq_records)
+                self.fix_partial_CDS(feature)
+                # print(feature)
+                # print(feature.annotations)
             if feature.type in exempted_features:
                 continue
             start = feature.location.start
             end = feature.location.end
             if isinstance(start, BeforePosition) or isinstance(end, AfterPosition):
-                self.logger.debug("Removing partial feature: {} in {}".format(feature.id, feature.seq_id))
+                self.logger.debug("Removing partial feature: {} at {}:{}".format(feature.id, feature.seq_id, str(feature.location)))
+                # temporary disabled for dev
                 removed.append(feature.id)
         for seq_record in self.genome.seq_records.values():
             seq_record.features = [feature for feature in seq_record.features if feature.id not in removed]
@@ -171,8 +178,8 @@ def _choose_representative_location(features):
         self.logger.info("Merged CDS features. {} CDSs in total.".format(cnt))
 
     def execute(self):
-        if self.enable_remove_partial:
-            self.remove_partial_features()
+        # if self.enable_remove_partial:
+        #     self.remove_partial_features()
 
         if self.enable_remove_overlapping:
             self.resolve_overlap()
@@ -180,6 +187,72 @@ def execute(self):
         if self.enable_merge_cds:
             self.merge_cds()
 
+    def execute_remove_partial(self):
+        if self.enable_remove_partial:
+            self.remove_partial_features()
+
+
+    def fix_partial_CDS(self, feature, min_length=500):
+        """
+        fix pseudo-partial CDS predicted at the end of the contig, but is likely to be intact.
+        The one that meet the following conditions will be retained
+            Starts at the coordinate 1 (+ strand) or at the end of contig (- strand)
+            5'-end is missing and 3'-end is intact (+ strand) or the opposited case (- strand) 
+            length is multiple of 3 and > min_length
+            the CDS has significant protein hit.
+
+        partial_flag 10, strand +, len>=500
+        partial_flag 01, strand -, len>=500
+
+        if the first 3 nuleotides are in [ATG, GTG, TTG], the CDS is fixed as intact,
+        otherwise, it is annotated as misc_feature
+        """
+        def _fix_partial(feature, seq):
+            self.logger.warning("Fixed partial CDS predicted at %s:%s", feature.seq_id, str(feature.location))
+            if feature.strand == 1:
+                feature.location = FeatureLocation(ExactPosition(0), feature.location.end, feature.strand)
+            else:
+                feature.location = FeatureLocation(feature.location.start, ExactPosition(len(seq)), feature.strand)
+            feature.annotations["partial_flag"] = "00"
+            if "partial" in feature.annotations:
+                del feature.annotations["partial"]
+
+        def _to_misc_feature(feature, protein_hit):
+            if not feature.type == "CDS":
+                return  # Do nothing for features other than CDS
+            self.logger.warning("Partial CDS predicted at %s:%s will be annotated as misc_feature", feature.seq_id, str(feature.location))
+            feature.type = "misc_feature"
+            note = f"partial CDS similar to {protein_hit.id}:{protein_hit.description}"
+            feature.qualifiers.setdefault("note", []).append(note)
+            for key in ["product", "translation", "transl_table", "codon_start"]:
+                if key in feature.qualifiers:
+                    del feature.qualifiers[key]
+
+        def _get_hit_description(feature):
+            if feature.primary_hit:
+                return feature.primary_hit
+            elif feature.secondary_hits:
+                return feature.secondary_hits[0]
+            else:
+                return None
+
+        acceptable_codons = ["ATG", "GTG", "TTG"]
+        seq = self.genome.seq_records[feature.seq_id].seq
+        protein_hit = _get_hit_description(feature)
+        if int(feature.location.start) == 0 and feature.strand == 1 and feature.annotations.get("partial_flag", "00") == "10" and len(feature) >= min_length:
+            # case: fix left partial CDS (<1..##) to intact CDS
+            first3 =  str(seq[:3]).upper()
+            if first3 in acceptable_codons and len(feature) % 3 == 0 and protein_hit:
+                _fix_partial(feature, seq)
+            elif protein_hit and protein_hit.description != "hypothetical protein":
+                _to_misc_feature(feature, protein_hit)
+        elif int(feature.location.end) == len(seq) and feature.strand == -1 and feature.annotations.get("partial_flag", "00") == "01" and len(feature) >= min_length:
+            # case: fix right partial CDS (##..>##)　to intact CDS
+            first3 =  str(seq[-3:].reverse_complement()).upper()
+            if first3 in acceptable_codons and len(feature) % 3 == 0 and protein_hit:
+                _fix_partial(feature, seq)
+            elif protein_hit and protein_hit.description != "hypothetical protein":
+                _to_misc_feature(feature, protein_hit)
 
 if __name__ == '__main__':
     pass
diff --git a/dfc/utils/locus_tag_generator.py b/dfc/utils/locus_tag_generator.py
index e580c54..464c808 100644
--- a/dfc/utils/locus_tag_generator.py
+++ b/dfc/utils/locus_tag_generator.py
@@ -34,6 +34,8 @@ def execute(self):
         digit = 1 if len(self.genome.features) == 0 else int(math.log10(len(self.genome.features) * self.step)) + 1
         for feature in self.genome.features.values():
             type_ = feature.type
+            if type_ == "misc_feature":
+                type_ = "CDS"   # temporarily treated as CDS
             if type_ in self.symbols:
                 counts[type_] += 1
                 count += 1
diff --git a/dfc_docker/Dockerfile b/dfc_docker/Dockerfile
index 36488fa..e89e533 100644
--- a/dfc_docker/Dockerfile
+++ b/dfc_docker/Dockerfile
@@ -25,7 +25,7 @@ RUN cd /tmp  && \
 	rm -r /tmp/tRNAscan-SE-2.0.6 /tmp/v2.0.6.tar.gz && \
 	cd /work
 
-
+ENV INCREMENT_THIS_TO_BUMP 1
 # Install dfast_core
 RUN cd / && \
     git clone https://github.com/nigyta/dfast_core && \