post processing and text_line extraction

debiff · Dec 6, 2017 · 0f9bec8 · 0f9bec8
1 parent 67b4346
commit 0f9bec8
Show file tree

Hide file tree

Showing 11 changed files with 222 additions and 27 deletions.
diff --git a/MHS/classes/region.py b/MHS/classes/region.py
@@ -200,7 +200,11 @@ def _draw(self, type):
             fitted_contours = np.copy(c.contour)
             fitted_contours[:, :, 0] -= self.xmin
             fitted_contours[:, :, 1] -= self.ymin
-            cv2.drawContours(pixels, [fitted_contours], -1, 0, cv2.FILLED)
+            if (type == 'total' or type == 'non-text') and len(c.inner_components.as_list()) > 1 \
+                    and (c.bb_width > (self.xmax - self.xmin) / 2 or c.bb_height >(self.ymax - self.ymin) / 2):
+                cv2.drawContours(pixels, [fitted_contours], -1, 0, 1)
+            else:
+                cv2.drawContours(pixels, [fitted_contours], -1, 0, cv2.FILLED)
         return pixels
 
     def save(self, path, type):

diff --git a/MHS/recursive_filter.py b/MHS/recursive_filter.py
@@ -229,14 +229,15 @@ def recursive_splitting(node, direction, region_collector):
     if homogeneity(node, direction):
         return
     split(node, direction, region_collector)
-    for leaves in region_collector.region_tree.leaves(node.identifier):
-        if leaves.identifier != node.identifier:
-            recursive_splitting(leaves, direction, region_collector)
+    leaves = region_collector.region_tree.leaves(node.identifier)
+    for leaf in leaves:
+        if leaf.identifier != node.identifier:
+            recursive_splitting(leaf, direction, region_collector)
 
 
 def multilevel_classification(node, region_collector):
     recursive_splitting(node, 'vertical', region_collector)
 
     vertical_nodes = region_collector.region_tree.leaves(node.identifier)
-    for leaves in vertical_nodes:
-        recursive_splitting(leaves, 'horizontal', region_collector)
+    for leaf in vertical_nodes:  # leaves found under node after the vertical pass above
+        recursive_splitting(leaf, 'horizontal', region_collector)  # second pass, horizontal direction
diff --git a/helper/component.py b/helper/component.py
@@ -195,7 +195,7 @@ def draw_rect_from_list(img, rect_list):
 
 
 def draw_rect(img, xmin, ymin, xmax, ymax, color):
-    cv2.rectangle(img, (xmin,ymin),(xmax,ymax), color, 1)
+    cv2.rectangle(img, (xmin,ymin),(xmax,ymax), color, 2)  # last positional argument is the outline thickness
     #cv2.imwrite('./samples/result.png', img)
 
 

diff --git a/helper/document.py b/helper/document.py
@@ -0,0 +1,3 @@
+def subtract(text_document, total_document):
+    """Return total_document minus text_document, using the operands' own ``-`` operator."""
+    return total_document - text_document
diff --git a/manager/filter.py b/manager/filter.py
@@ -88,29 +88,35 @@ def recursive_filter(region_collector):
     while homogeneous_region_extraction:
         white_space_analysis = False
         homogeneous_region_extraction = False
-        for leaves in region_collector.region_tree.leaves(region_collector.region_tree.root):
-            if leaves.identifier not in clean_leaves:
+        leaves = region_collector.region_tree.leaves(region_collector.region_tree.root)
+        for leaf in leaves:
+            if leaf.identifier not in clean_leaves:
                 white_space_analysis = True
-                multilevel_classification(leaves, region_collector)
+                multilevel_classification(leaf, region_collector)
 
         if white_space_analysis:
-            for leaves in region_collector.region_tree.leaves(region_collector.region_tree.root):
-                if leaves.identifier not in clean_leaves:
-                    region_changed = True
-                    while region_changed:
-                        region_changed = False
-                        if maximum_median(leaves.data):
+            leaves = region_collector.region_tree.leaves(region_collector.region_tree.root)
+            for leaf in leaves:
+                if leaf.identifier not in clean_leaves:
+                    region_changed = False
+                    continue_analysis = True
+                    while continue_analysis:
+                        continue_analysis = False
+                        if maximum_median(leaf.data):
+                            continue_analysis = True
                             region_changed = True
                             homogeneous_region_extraction = True
-                            leaves.data.included.max_area_component.type = 'non_text'
-                            leaves.data.included.manually_clear_cache()
-                        elif minimum_median(leaves.data):
+                            leaf.data.included.max_area_component.type = 'non_text'
+                            leaf.data.included.manually_clear_cache()
+                            continue
+                        if minimum_median(leaf.data):
+                            continue_analysis = True
                             region_changed = True
                             homogeneous_region_extraction = True
-                            leaves.data.included.min_area_component.type = 'non_text'
-                            leaves.data.included.manually_clear_cache()
-                    if homogeneous_region_extraction and len(leaves.data.included.text_component().as_list()) > 0:
-                        leaves.data.manually_clear_cache()
+                            leaf.data.included.min_area_component.type = 'non_text'
+                            leaf.data.included.manually_clear_cache()
+                    if region_changed and len(leaf.data.included.text_component().as_list()) > 0:
+                        leaf.data.manually_clear_cache()
                     else:
-                        clean_leaves.append(leaves.identifier)
+                        clean_leaves.append(leaf.identifier)
 
diff --git a/manager/post_processing.py b/manager/post_processing.py
@@ -0,0 +1,56 @@
+from postProcessing.classes.line import Line
+from postProcessing.classes.line_collector import LineCollector
+
+
+def is_near(comp_left, comp_right):
+    """Return True when the horizontal gap between the components is at most 1.2x the taller bounding box."""
+    gap = abs(comp_right.xmin - comp_left.xmax)
+    tallest = max(comp_left.bb_height, comp_right.bb_height)
+    return True if gap <= 1.2 * tallest else False
+
+
+def create_line(comp_list):
+    """Build the Line spanning the smallest xmin/ymin and the largest xmax/ymax found in comp_list."""
+    return Line(
+        min(comp.xmin for comp in comp_list),
+        min(comp.ymin for comp in comp_list),
+        max(comp.xmax for comp in comp_list),
+        max(comp.ymax for comp in comp_list))
+
+
+def find_lines(text_component):
+    """Group text components into rows: one xmin-sorted list per row, built from each component's same_row."""
+    rows = []
+    for candidate in text_component.as_list():
+        if any(candidate in row for row in rows):
+            continue  # already placed in a row built from an earlier component
+        members = [other for other in candidate.same_row.as_list() if other.type == 'text']
+        if not members:
+            continue  # no 'text' component in same_row: this component yields no row
+        if candidate not in members:
+            members.append(candidate)
+        rows.append(sorted(members, key=lambda comp: comp.xmin))
+    return rows
+
+
+def text_segmentation(text_tree):
+    """Chain neighbouring text components of each row into Lines.
+
+    Components of a row (as ordered by find_lines) extend the current chain
+    while is_near() holds; every finished chain is added to the LineCollector returned.
+    """
+    l_collector = LineCollector()
+    component_lines = find_lines(text_tree.included.text_component())
+    for line in component_lines:
+        if not line:
+            continue  # nothing to chain on an empty row
+        chain = [line[0]]
+        for left, right in zip(line, line[1:]):
+            if is_near(left, right):
+                chain.append(right)
+            else:
+                l_collector.add_line(create_line(chain))
+                chain = [right]
+        # Flush after the loop so that a single-component row is emitted too.
+        l_collector.add_line(create_line(chain))
+    return l_collector
diff --git a/postProcessing/classes/line.py b/postProcessing/classes/line.py
@@ -0,0 +1,23 @@
+
+class Line:
+    """Bounding box of one text line.
+
+    The four coordinates passed to the constructor are kept on private
+    attributes and exposed through read-only properties.  No setters are
+    defined, so assigning to ``xmin``, ``ymin``, ``xmax`` or ``ymax``
+    raises AttributeError.
+    """
+
+    def __init__(self, x_min, y_min, x_max, y_max):
+        # Stored exactly as given; no validation or reordering is performed.
+        self._xmin, self._ymin = x_min, y_min
+        self._xmax, self._ymax = x_max, y_max
+
+    # Read-only accessors.
+    xmin = property(lambda self: self._xmin)
+
+    ymin = property(lambda self: self._ymin)
+
+    xmax = property(lambda self: self._xmax)
+
+    ymax = property(lambda self: self._ymax)
diff --git a/postProcessing/classes/line_collector.py b/postProcessing/classes/line_collector.py
@@ -0,0 +1,47 @@
+import numpy as np
+from postProcessing.classes.line import Line
+
+class LineCollector:
+    """Collection of Line boxes that merges a new line with the stored lines it overlaps."""
+
+    def __init__(self):
+        self._line_list = []
+        self._cached_matrix = None  # lazily built array of boxes, reset on every change
+
+    def add_line(self, line):
+        """Store line, first merging it with every stored line it overlaps."""
+        overlapped = self.overlap(line)
+        if overlapped:
+            line = self.unify(overlapped, line)
+        self._line_list.append(line)
+        self._cached_matrix = None
+
+    def as_matrix(self):
+        """Return the stored boxes as rows of [xmin, ymin, xmax, ymax] (cached until the next change)."""
+        if self._cached_matrix is None:
+            self._cached_matrix = np.array([[v.xmin, v.ymin, v.xmax, v.ymax] for v in self._line_list])
+        return self._cached_matrix
+
+    def as_list(self):
+        """Return the internal list of stored lines (not a copy)."""
+        return self._line_list
+
+    def overlap(self, line):
+        """Return the indices of stored lines overlapping line (x bounds inclusive, y overlap strict)."""
+        if not self._line_list:
+            return []  # the matrix of an empty collector is 1-D and cannot be sliced by column
+        xmin, ymin, xmax, ymax = self.as_matrix().T
+        # The x ranges intersect when either box starts inside the other one.
+        starts_in_stored = (xmin <= line.xmin) & (line.xmin <= xmax)
+        stored_starts_in_line = (line.xmin <= xmin) & (xmin <= line.xmax)
+        same_row = np.maximum(ymin, line.ymin) - np.minimum(ymax, line.ymax) < 0
+        return np.where((starts_in_stored | stored_starts_in_line) & same_row)[0].tolist()
+
+    def unify(self, overlapped, line):
+        """Remove the stored lines at the overlapped indices and return their union with line."""
+        boxes = [line] + [self._line_list[i] for i in overlapped]
+        for index in sorted(overlapped, reverse=True):
+            del self._line_list[index]
+        self._cached_matrix = None  # the stored lines changed, so the cache is stale
+        return Line(min(b.xmin for b in boxes), min(b.ymin for b in boxes),
+                    max(b.xmax for b in boxes), max(b.ymax for b in boxes))
diff --git a/postProcessing/classes/paragraph.py b/postProcessing/classes/paragraph.py
diff --git a/postProcessing/classes/paragraph_collector.py b/postProcessing/classes/paragraph_collector.py
diff --git a/test_class.py b/test_class.py
@@ -8,12 +8,21 @@
 from MHS.classes.region_collector import RegionCollector
 import numpy as np
 import manager.filter as component_filter
+import math
+from manager.post_processing import text_segmentation
 
 timer = datetime.now()
 region_collector = RegionCollector()
-img, gray = image.load_and_gray('./samples/0001.jpg')
+img, gray = image.load_and_gray('./samples/icdar.jpg')
 binary = image.binarize(gray)
 
+
+# num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(binary, 4, cv2.CV_32S)
+# labels[labels == 385] = 10000
+# labels[labels < 10000] = 255
+# labels[labels == 10000] = 0
+#
+# cv2.imwrite('./samples/split/labeled.png', labels)
 contours, hierarchy = component.find_component(binary)
 
 print((datetime.now()-timer))
@@ -25,11 +34,57 @@
 
 region_collector.add_region(document)
 
-
-a = [x for x in region_collector.region_tree.get_node(region_collector.region_tree.root).data.included.text_component().as_list() if x.type == 'non_text']
 component_filter.recursive_filter(region_collector)
 
+region_collector.region_tree.get_node(region_collector.region_tree.root).data.included.manually_clear_cache()
+region_collector.region_tree.get_node(region_collector.region_tree.root).data.manually_clear_cache()
+
+
+def remove_text(document, bin):  # NOTE: the parameter name `bin` shadows the builtin of the same name
+    for component in document.included.text_component().as_list():
+        bin[component.ymin:component.ymax, component.xmin:component.xmax] = 255  # overwrite the component's bounding box
+    return bin  # the object that was passed in, modified in place
+
+
+def fill_non_text(non_text_image, non_text_image_bb):
+    """Dilate the inverse of non_text_image_bb (box 0.5% of the image), invert back, zero where non_text_image == 1."""
+    rows, cols = non_text_image.shape[:2]  # cv2 kernel sizes are (width, height), i.e. (cols, rows)
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (math.ceil(cols * 0.005), math.ceil(rows * 0.005)))
+    res = np.bitwise_not(cv2.morphologyEx(np.bitwise_not(non_text_image_bb), cv2.MORPH_DILATE, kernel))
+    res[non_text_image == 1] = 0
+    return res
+
+
+def compare_text_non_text(document, filled):
+    """Mark as 'non_text' every text component whose bounding box in filled contains a 0 value."""
+    for component in document.included.text_component().as_list():
+        window = filled[component.ymin:component.ymax, component.xmin:component.xmax]
+        if np.any(window == 0):
+            component.type = 'non_text'
+
+
+def post_processing(document, bin):
+    """Run remove_text, fill_non_text and compare_text_non_text in sequence on document and bin."""
+    text_free = remove_text(document, bin)  # evaluated before bin_pixel(), as in the original statement order
+    compare_text_non_text(document, fill_non_text(document.bin_pixel('non-text'), text_free))
+
+post_processing(region_collector.region_tree.get_node(region_collector.region_tree.root).data, binary)
 
 region_collector.region_tree.get_node(region_collector.region_tree.root).data.included.manually_clear_cache()
 region_collector.region_tree.get_node(region_collector.region_tree.root).data.manually_clear_cache()
+
+text_tree = RegionCollector()
+text_tree.add_region(region_collector.region_tree.get_node(region_collector.region_tree.root).data)
+
+t_lines = text_segmentation(text_tree.region_tree.get_node(text_tree.region_tree.root).data)
+
+
+
+
+
+for l in t_lines.as_list():
+    component.draw_rect(img, l.xmin, l.ymin, l.xmax, l.ymax, (255,0,0))
+cv2.imwrite('./samples/split/line.png', img)
 region_collector.region_tree.get_node(region_collector.region_tree.root).data.save('./samples/split/root.png', 'text')
+region_collector.region_tree.get_node(region_collector.region_tree.root).data.save('./samples/split/root_non_text.png', 'non-text')
+print((datetime.now()-timer))