Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DISCUSSION] Py3k simple #114

Closed
wants to merge 12 commits into from
8 changes: 5 additions & 3 deletions ocrolib/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import absolute_import, division, print_function

__all__ = [
"binnednn","cairoextras","common","components","dbtables",
"fgen","gmmtree","gtkyield","hocr","lang","native",
Expand All @@ -8,6 +10,6 @@
### top level imports
################################################################

import default
from common import *
from default import traceback as trace
from . import default
from .common import *
from .default import traceback as trace
30 changes: 16 additions & 14 deletions ocrolib/chars.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
# -*- encoding: utf-8 -*-

from __future__ import absolute_import, division, print_function

import re

# common character sets

# Basic ASCII character classes used to build the larger sets below.
digits = u"0123456789"
letters = u"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
# The `ur` prefix is a syntax error on Python 3, and a non-raw "\]" is an
# invalid escape sequence, so the backslash is written out explicitly.
symbols = u"""!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""
ascii = digits+letters+symbols

xsymbols = u"""€¢£»«›‹÷©®†‡°∙•◦‣¶§÷¡¿▪▫"""
Expand Down Expand Up @@ -58,28 +60,28 @@

def requote(s):
    """Convert s to text and replace every pair of straight single
    quotes ('') with one straight double quote (")."""
    # `unicode` only exists on Python 2; on Python 3 `str` is the text type.
    try:
        text_type = unicode
    except NameError:
        text_type = str
    s = text_type(s)
    # A plain raw pattern replaces the Python-2-only `ur"..."` literal and
    # matches exactly the same text.
    return re.sub(r"''", u'"', s)

def requote_fancy(s,germanic=0):
    """Convert s to text and replace ASCII quote sequences that touch
    whitespace with typographic quotation marks.

    A quote sequence preceded by whitespace gets one glyph and a quote
    sequence followed by whitespace gets the mirrored glyph; with a true
    `germanic` the two glyphs are swapped.  Whitespace followed by ",,"
    or "," becomes a low double / low single quote in both styles.  The
    whitespace matched by a rule is consumed together with the quotes.
    """
    # `unicode` only exists on Python 2; on Python 3 `str` is the text type.
    try:
        text_type = unicode
    except NameError:
        text_type = str
    s = text_type(s)
    if germanic:
        # germanic quoting style reverses the shapes
        after_space_double, before_space_double = u"”", u"“"
        after_space_single, before_space_single = u"’", u"‘"
    else:
        after_space_double, before_space_double = u"“", u"”"
        after_space_single, before_space_single = u"‘", u"’"
    # Order matters: the two-character forms ('' and ,,) must be rewritten
    # before the one-character rules get a chance to match their first half.
    rules = [
        # straight double quotes
        (r"\s+''", after_space_double),
        (r"''\s+", before_space_double),
        (r"\s+,,", u"„"),
        # straight single quotes
        (r"\s+'", after_space_single),
        (r"'\s+", before_space_single),
        (r"\s+,", u"‚"),
    ]
    for pattern, replacement in rules:
        s = re.sub(pattern, replacement, s)
    return s
86 changes: 44 additions & 42 deletions ocrolib/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
### common functions for data structures, file name manipulation, etc.
################################################################

from __future__ import absolute_import, division, print_function

import os,os.path
import re
import numpy
Expand All @@ -13,14 +15,13 @@
import glob
from numpy import *
from scipy.ndimage import morphology
import ligatures
import multiprocessing
import lstm
import pylab

from pylab import imshow
import morph
from toplevel import *

from . import ligatures
from . import morph
from .toplevel import *

################################################################
### exceptions
Expand Down Expand Up @@ -89,7 +90,7 @@ def _wrap(f):
warned = 0
def _wrapper(*args,**kw):
if not warned:
print f,"has been DEPRECATED"
print(f, "has been DEPRECATED")
warned = 1
return f(*args,**kw)
return _wrap
Expand All @@ -100,7 +101,7 @@ def _wrapper(*args,**kw):
# text normalization
################################################################

import chars
from . import chars
replacements = chars.replacements

def normalize_text(s):
Expand All @@ -109,10 +110,10 @@ def normalize_text(s):
characters."""
s = unicode(s)
s = unicodedata.normalize('NFC',s)
s = re.sub(ur'\s+(?u)',' ',s)
s = re.sub(ur'\n(?u)','',s)
s = re.sub(ur'^\s+(?u)','',s)
s = re.sub(ur'\s+$(?u)','',s)
s = re.sub(r'\s+(?u)',' ',s)
s = re.sub(r'\n(?u)','',s)
s = re.sub(r'^\s+(?u)','',s)
s = re.sub(r'\s+$(?u)','',s)
for m,r in replacements:
s = re.sub(unicode(m),unicode(r),s)
return s
Expand All @@ -121,23 +122,23 @@ def project_text(s,kind="exact"):
"""Project text onto a smaller subset of characters
for comparison."""
s = normalize_text(s)
s = re.sub(ur'( *[.] *){4,}',u'....',s) # dot rows
s = re.sub(ur'[~_]',u'',s) # dot rows
s = re.sub(r'( *[.] *){4,}',u'....',s) # dot rows
s = re.sub(r'[~_]',u'',s) # dot rows
if kind=="exact":
return s
if kind=="nospace":
return re.sub(ur'\s','',s)
return re.sub(r'\s','',s)
if kind=="spletdig":
return re.sub(ur'[^A-Za-z0-9 ]','',s)
return re.sub(r'[^A-Za-z0-9 ]','',s)
if kind=="letdig":
return re.sub(ur'[^A-Za-z0-9]','',s)
return re.sub(r'[^A-Za-z0-9]','',s)
if kind=="letters":
return re.sub(ur'[^A-Za-z]','',s)
return re.sub(r'[^A-Za-z]','',s)
if kind=="digits":
return re.sub(ur'[^0-9]','',s)
return re.sub(r'[^0-9]','',s)
if kind=="lnc":
s = s.upper()
return re.sub(ur'[^A-Z]','',s)
return re.sub(r'[^A-Z]','',s)
raise BadInput("unknown normalization: "+kind)

################################################################
Expand Down Expand Up @@ -222,7 +223,7 @@ def read_image_gray(fname,pageno=0):
The optional page number allows images from files containing multiple
images to be addressed. Byte and short arrays are rescaled to
the range 0...1 (unsigned) or -1...1 (signed)."""
if type(fname)==tuple: fname,pageno = fname
if isinstance(fname, tuple): fname,pageno = fname
assert pageno==0
pil = PIL.Image.open(fname)
a = pil2array(pil)
Expand All @@ -248,7 +249,7 @@ def write_image_gray(fname,image,normalize=0,verbose=0):
type, its values are clipped to the range [0,1],
multiplied by 255 and converted to unsigned bytes. Otherwise,
the image must be of type unsigned byte."""
if verbose: print "# writing",fname
if verbose: print("# writing", fname)
if isfloatarray(image):
image = array(255*clip(image,0.0,1.0),'B')
assert image.dtype==dtype('B'),"array has wrong dtype: %s"%image.dtype
Expand All @@ -259,7 +260,7 @@ def write_image_gray(fname,image,normalize=0,verbose=0):
def read_image_binary(fname,dtype='i',pageno=0):
"""Read an image from disk and return it as a binary image
of the given dtype."""
if type(fname)==tuple: fname,pageno = fname
if isinstance(fname, tuple): fname,pageno = fname
assert pageno==0
pil = PIL.Image.open(fname)
a = pil2array(pil)
Expand All @@ -271,7 +272,7 @@ def write_image_binary(fname,image,verbose=0):
"""Write a binary image to disk. This verifies first that the given image
is, in fact, binary. The image may be of any type, but must consist of only
two values."""
if verbose: print "# writing",fname
if verbose: print("# writing", fname)
assert image.ndim==2
image = array(255*(image>midrange(image)),'B')
im = array2pil(image)
Expand Down Expand Up @@ -428,7 +429,7 @@ def bbox(self,i):
"""Return the bounding box in raster coordinates
(row0,col0,row1,col1)."""
r = self.objects[i]
# print "@@@bbox",i,r
# print("@@@bbox", i, r)
return (r[0].start,r[1].start,r[0].stop,r[1].stop)
def bboxMath(self,i):
"""Return the bounding box in math coordinates
Expand All @@ -442,7 +443,7 @@ def length(self):
def mask(self,index,margin=0):
"""Return the mask for component index."""
b = self.objects[index]
#print "@@@mask",index,b
# print("@@@mask", index, b)
m = self.labels[b]
m[m!=index] = 0
if margin>0: m = pad_by(m,margin)
Expand Down Expand Up @@ -490,9 +491,10 @@ def save_object(fname,obj,zip=0):

def unpickle_find_global(mname,cname):
    """Return the attribute `cname` of the module named `mname`.

    The module name "lstm.lstm" is special-cased and resolved against this
    package's own `lstm` module.  Any other module is imported first if it
    is not already present in `sys.modules`.

    Raises ImportError if the module cannot be imported and AttributeError
    if it has no attribute `cname`.
    """
    # NOTE(review): presumably used as the find-global hook while
    # unpickling; it imports whatever module the data names, so it must
    # only be used on trusted pickles.
    if mname=="lstm.lstm":
        from . import lstm
        return getattr(lstm,cname)
    if mname not in sys.modules:
        # import_module registers the module in sys.modules without
        # exec-ing a source string assembled from input data.
        import importlib
        importlib.import_module(mname)
    return getattr(sys.modules[mname],cname)

def load_object(fname,zip=0,nofind=0,verbose=0):
Expand All @@ -502,7 +504,7 @@ class names that have changed."""
if not nofind:
fname = ocropus_find_file(fname)
if verbose:
print "# loading object",fname
print("# loading object", fname)
if zip==0 and fname.endswith(".gz"):
zip = 1
if zip>0:
Expand Down Expand Up @@ -572,30 +574,30 @@ def parallel_map(fun,jobs,parallel=0,chunksize=1):
def check_valid_class_label(s):
    """Determines whether the given character is a valid class label.
    Control characters and spaces are not permitted.

    Text strings must not contain any character in the range \\0-\\x20;
    byte strings may only contain bytes in the range \\x21-\\x7e.
    Raises BadClassLabel for an invalid label or for any other type;
    returns None otherwise.
    """
    # `unicode` only exists on Python 2; on Python 3 `str` is the text type
    # and `bytes` is the byte-string type (an alias of `str` on Python 2).
    try:
        text_type = unicode
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        if re.search(r'[\0-\x20]',s):
            raise BadClassLabel(s)
    elif isinstance(s, bytes):
        # bytes pattern so the search also works on Python 3 byte strings
        if re.search(br'[^\x21-\x7e]',s):
            raise BadClassLabel(s)
    else:
        raise BadClassLabel(s)

def summary(x):
    """Summarize a datatype as a string (for display and debugging).

    Arrays are reported by shape and dtype; strings and lists with more
    than 10 elements get a trailing "..." marker (strings are additionally
    wrapped in double quotes); anything else is passed through str().
    """
    if isinstance(x, numpy.ndarray):
        return "<ndarray %s %s>"%(x.shape,x.dtype)
    if isinstance(x, str) and len(x)>10:
        return '"%s..."'%x
    if isinstance(x, list) and len(x)>10:
        # one-element tuple keeps the % operand unambiguous
        return '%s...'%(x,)
    return str(x)

################################################################
### file name manipulation
################################################################

from default import getlocal
from .default import getlocal


@checks(str,_=str)
Expand Down Expand Up @@ -636,7 +638,7 @@ def base(path):
def write_text_simple(file,s):
    """Write the given string s to the output file.

    Text strings are encoded as UTF-8; byte strings are written unchanged.
    """
    # Anything that is not already a byte string is text and gets encoded.
    # (`bytes` is an alias of `str` on Python 2, so only `unicode` objects
    # are encoded there, as before.)
    if not isinstance(s, bytes):
        s = s.encode("utf-8")
    # Binary mode: on Python 3 a text-mode stream rejects the encoded bytes.
    # NOTE(review): binary mode also disables newline translation on
    # Windows — confirm no caller depends on it.
    with open(file,"wb") as stream:
        stream.write(s)

@checks([str])
Expand Down Expand Up @@ -839,8 +841,8 @@ def pyconstruct(s):
path = s[:s.find("(")]
if "." in path:
module = path[:path.rfind(".")]
print "import",module
exec "import "+module in env
print("import", module)
exec("import "+module, env)
return eval(s,env)

def mkpython(name):
Expand All @@ -849,7 +851,7 @@ def mkpython(name):
doesn't look like a Python class."""
if name is None or len(name)==0:
return None
elif type(name) is not str:
elif not isinstance(name, str):
return name()
elif name[0]=="=":
return pyconstruct(name[1:])
Expand Down Expand Up @@ -896,15 +898,15 @@ def save_component(file,object,verbose=0,verify=0):
ocropus.save_component(file,object)
return
if verbose:
print "[save_component]"
print("[save_component]")
if verbose:
for k,v in object.__dict__.items():
print ":",k,obinfo(v)
print(":", k, obinfo(v))
with open(file,"wb") as stream:
pickle.dump(object,stream,pickle_mode)
if verify:
if verbose:
print "[trying to read it again]"
print("[trying to read it again]")
with open(file,"rb") as stream:
pickle.load(stream)

Expand Down Expand Up @@ -961,7 +963,7 @@ def draw_aligned(result,axis=None):
axis = subplot(111)
axis.imshow(NI(result.image),cmap=cm.gray)
cseg = result.cseg
if type(cseg)==numpy.ndarray: cseg = common.lseg2narray(cseg)
if isinstance(cseg, numpy.ndarray): cseg = common.lseg2narray(cseg)
ocropy.make_line_segmentation_black(cseg)
ocropy.renumber_labels(cseg,1)
bboxes = ocropy.rectarray()
Expand Down
2 changes: 2 additions & 0 deletions ocrolib/default.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# the defaults used by the recognizer

from __future__ import absolute_import, division, print_function

import os

modeldir = "/usr/local/share/ocropus/"
Expand Down
2 changes: 2 additions & 0 deletions ocrolib/edist.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import absolute_import, division, print_function

from scipy.ndimage import filters
from pylab import *
import re
Expand Down
10 changes: 6 additions & 4 deletions ocrolib/extras/cairoextras.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import absolute_import, division, print_function

import ctypes
import cairo

Expand Down Expand Up @@ -25,7 +27,7 @@ def create_cairo_font_face_for_file(filename, faceindex=0, loadoptions=0):
# initialize freetype
_ft_lib = ctypes.c_void_p()
if FT_Err_Ok != _freetype_so.FT_Init_FreeType(ctypes.byref(_ft_lib)):
raise "Error initialising FreeType library."
raise OSError("Error initialising FreeType library.")
_surface = cairo.ImageSurface(cairo.FORMAT_A8, 0, 0)
_initialized = True
# create freetype face
Expand All @@ -34,14 +36,14 @@ def create_cairo_font_face_for_file(filename, faceindex=0, loadoptions=0):
cairo_t = PycairoContext.from_address(id(cairo_ctx)).ctx
_cairo_so.cairo_ft_font_face_create_for_ft_face.restype = ctypes.c_void_p
if FT_Err_Ok != _freetype_so.FT_New_Face(_ft_lib, filename, faceindex, ctypes.byref(ft_face)):
raise "Error creating FreeType font face for " + filename
raise Exception("Error creating FreeType font face for " + filename)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I should probably make this a more specific error.

# create cairo font face for freetype face
cr_face = _cairo_so.cairo_ft_font_face_create_for_ft_face(ft_face, loadoptions)
if CAIRO_STATUS_SUCCESS != _cairo_so.cairo_font_face_status(cr_face):
raise "Error creating cairo font face for " + filename
raise Exception("Error creating cairo font face for " + filename)
_cairo_so.cairo_set_font_face(cairo_t, cr_face)
if CAIRO_STATUS_SUCCESS != _cairo_so.cairo_status(cairo_t):
raise "Error creating cairo font face for " + filename
raise Exception("Error creating cairo font face for " + filename)
face = cairo_ctx.get_font_face()
return face

Expand Down
Loading