
Commit

- Rename columns of non-UTF-8 shapefile attributes before ingesting
 - Fix test cases
afabiani committed Oct 31, 2018
1 parent 2535b19 commit a514354
Showing 7 changed files with 105 additions and 73 deletions.
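
The changes below thread a user-supplied charset from the upload form down to the column-renaming helper in geonode/utils.py. A minimal sketch of the intended flow, using only names introduced or touched by this commit (the wrapper function and the file path are illustrative, not part of the change):

# Sketch only: how the new charset argument is expected to travel through the
# upload pipeline after this commit.
from geonode.upload.files import scan_file            # gains charset=None
from geonode.utils import fixup_shp_columnnames       # gains charset and tempdir arguments


def scan_with_charset(base_file, charset='windows-1258'):
    # scan_file() now calls fixup_shp_columnnames(path, charset) on every file
    # it collects, so non-UTF-8 attribute names are renamed before ingestion.
    return scan_file(base_file, scan_hint=None, charset=charset)
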
14 changes: 12 additions & 2 deletions geonode/layers/views.py
@@ -27,7 +27,7 @@
import uuid
import decimal
import re

import cPickle as pickle
from django.db.models import Q
from celery.exceptions import TimeoutError

@@ -239,7 +239,7 @@ def layer_upload(request, template='upload/layer_upload.html'):
user=request.user).order_by('-date')
if latest_uploads.count() > 0:
upload_session = latest_uploads[0]
upload_session.error = str(error)
upload_session.error = pickle.dumps(error).decode("utf-8", "replace")
upload_session.traceback = traceback.format_exc(tb)
upload_session.context = log_snippet(CONTEXT_LOG_FILE)
upload_session.save()
@@ -286,6 +286,16 @@ def layer_upload(request, template='upload/layer_upload.html'):
layer_name = saved_layer.alternate if hasattr(
saved_layer, 'alternate') else name
request.add_resource('layer', layer_name)
_keys = ['info', 'errors']
for _k in _keys:
if _k in out:
if isinstance(out[_k], unicode) or isinstance(
out[_k], str):
out[_k] = out[_k].decode(saved_layer.charset).encode("utf-8")
elif isinstance(out[_k], dict):
for key, value in out[_k].iteritems():
out[_k][key] = out[_k][key].decode(saved_layer.charset).encode("utf-8")
out[_k][key.decode(saved_layer.charset).encode("utf-8")] = out[_k].pop(key)
return HttpResponse(
json.dumps(out),
content_type='application/json',
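
The new block in layer_upload() re-encodes the 'info' and 'errors' entries of the upload response from the layer's declared charset to UTF-8 before they are JSON-serialized. A standalone Python 2 sketch of the same idea (the helper name is illustrative; the commit inlines this logic instead):

def reencode_payload(out, charset):
    """Re-encode the 'info'/'errors' entries of the upload response to UTF-8."""
    def to_utf8(value):
        if isinstance(value, str):             # Python 2 byte string in `charset`
            return value.decode(charset, 'replace').encode('utf-8')
        return value                           # already unicode, or not text

    for key in ('info', 'errors'):
        if key not in out:
            continue
        entry = out[key]
        if isinstance(entry, dict):
            # Re-encode keys and values into a fresh dict to avoid mutating
            # the mapping while iterating over it.
            out[key] = dict((to_utf8(k), to_utf8(v)) for k, v in entry.items())
        else:
            out[key] = to_utf8(entry)
    return out
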
8 changes: 4 additions & 4 deletions geonode/tests/integration.py
@@ -629,9 +629,9 @@ def test_layer_zip_upload_non_utf8(self):
if os.path.exists(thelayer_path) and not os.path.exists(thelayer_zip):
zip_dir(thelayer_path, thelayer_zip)
if os.path.exists(thelayer_zip):
uploaded = file_upload(thelayer_zip, overwrite=True)
uploaded = file_upload(thelayer_zip, overwrite=True, charset='windows-1258')
self.assertEquals(uploaded.title, 'Zhejiang Yangcan Yanyu')
self.assertEquals(len(uploaded.keyword_list()), 0)
self.assertEquals(len(uploaded.keyword_list()), 2)
self.assertEquals(uploaded.constraints_other, None)
finally:
# Clean up and completely delete the layer
@@ -652,9 +652,9 @@ def test_layer_zip_upload_non_utf8(self):
if os.path.exists(thelayer_path) and not os.path.exists(thelayer_zip):
zip_dir(thelayer_path, thelayer_zip)
if os.path.exists(thelayer_zip):
uploaded = file_upload(thelayer_zip, overwrite=True)
uploaded = file_upload(thelayer_zip, overwrite=True, charset='windows-1258')
self.assertEquals(uploaded.title, 'Ming Female 1')
self.assertEquals(len(uploaded.keyword_list()), 0)
self.assertEquals(len(uploaded.keyword_list()), 2)
self.assertEquals(uploaded.constraints_other, None)
finally:
# Clean up and completely delete the layer
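
Both tests now pass the shapefile's real encoding through to the upload helper; a minimal sketch of the call they exercise (the import path and the archive path are assumptions based on this tree, not shown in the diff):

from geonode.layers.utils import file_upload   # assumed location of file_upload

uploaded = file_upload('/tmp/zhejiang_yangcan_yanyu.zip',   # illustrative path
                       overwrite=True,
                       charset='windows-1258')
print(uploaded.title)
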
36 changes: 24 additions & 12 deletions geonode/upload/files.py
@@ -25,9 +25,9 @@
'''

import os.path
from geoserver.resource import FeatureType
from geoserver.resource import Coverage

from geonode.utils import fixup_shp_columnnames
from geoserver.resource import FeatureType, Coverage
from django.utils.translation import ugettext as _

from UserList import UserList
@@ -256,17 +256,24 @@ def get_scan_hint(valid_extensions):
return result


def scan_file(file_name, scan_hint=None):
def scan_file(file_name, scan_hint=None, charset=None):
'''get a list of SpatialFiles for the provided file'''
if not os.path.exists(file_name):
raise Exception(_("Could not access to uploaded data."))

dirname = os.path.dirname(file_name)
if zipfile.is_zipfile(file_name):
paths, kept_zip = _process_zip(file_name, dirname, scan_hint=scan_hint)
paths, kept_zip = _process_zip(file_name,
dirname,
scan_hint=scan_hint,
charset=charset)
archive = file_name if kept_zip else None
else:
paths = [os.path.join(dirname, p) for p in os.listdir(dirname)]
paths = []
for p in os.listdir(dirname):
_f = os.path.join(dirname, p)
fixup_shp_columnnames(_f, charset)
paths.append(_f)
archive = None
if paths is not None:
safe_paths = _rename_files(paths)
@@ -305,7 +312,7 @@ def scan_file(file_name, scan_hint=None):
return SpatialFiles(dirname, found, archive=archive)


def _process_zip(zip_path, destination_dir, scan_hint=None):
def _process_zip(zip_path, destination_dir, scan_hint=None, charset=None):
"""Perform sanity checks on uploaded zip file
This function will check if the zip file's contents have legal names.
@@ -318,10 +325,10 @@ def _process_zip(zip_path, destination_dir, scan_hint=None):
safe_zip_path = _rename_files([zip_path])[0]
with zipfile.ZipFile(safe_zip_path, "r") as zip_handler:
if scan_hint in _keep_original_data:
extracted_paths = _extract_zip(zip_handler, destination_dir)
extracted_paths = _extract_zip(zip_handler, destination_dir, charset)
else:
extracted_paths = _sanitize_zip_contents(
zip_handler, destination_dir)
zip_handler, destination_dir, charset)
if extracted_paths is not None:
all_paths = extracted_paths
kept_zip = False
@@ -333,16 +340,21 @@ def _process_zip(zip_path, destination_dir, scan_hint=None):
return all_paths, kept_zip


def _sanitize_zip_contents(zip_handler, destination_dir):
def _sanitize_zip_contents(zip_handler, destination_dir, charset):
clean_macosx_dir(zip_handler.namelist())
result = _extract_zip(zip_handler, destination_dir)
result = _extract_zip(zip_handler, destination_dir, charset)
return result


def _extract_zip(zip_handler, destination):
def _extract_zip(zip_handler, destination, charset):
file_names = zip_handler.namelist()
zip_handler.extractall(destination)
return [os.path.join(destination, p) for p in file_names]
paths = []
for p in file_names:
_f = os.path.join(destination, p)
fixup_shp_columnnames(_f, charset)
paths.append(_f)
return paths


def _probe_zip_for_sld(zip_handler, destination_dir):
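
With these changes every path produced by scan_file() — whether extracted from a zip archive or already sitting next to the uploaded file — is passed through fixup_shp_columnnames() before being wrapped in SpatialFiles. A hedged usage sketch (the archive path is illustrative; when charset is omitted the fix-up falls back to UTF-8):

from geonode.upload.files import scan_file

spatial_files = scan_file('/tmp/uploads/roads.zip', charset='windows-1258')
for spatial_file in spatial_files:
    # Attribute names have already been renamed to UTF-8-safe identifiers here.
    print(spatial_file.base_file)
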
1 change: 1 addition & 0 deletions geonode/upload/forms.py
@@ -47,6 +47,7 @@ class LayerUploadForm(forms.Form):
shx_file = forms.FileField(required=False)
prj_file = forms.FileField(required=False)
xml_file = forms.FileField(required=False)
charset = forms.CharField(required=False)

if check_ogc_backend(geoserver.BACKEND_PACKAGE):
sld_file = forms.FileField(required=False)
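
LayerUploadForm now accepts an optional charset field, so clients can declare the attribute encoding of the shapefile they are sending. An illustrative client-side request (the endpoint URL is an assumption; authentication and the other required form fields are omitted):

import requests   # assumption: a plain HTTP client outside GeoNode

files = {
    'base_file': open('roads.shp', 'rb'),
    'dbf_file': open('roads.dbf', 'rb'),
    'shx_file': open('roads.shx', 'rb'),
    'prj_file': open('roads.prj', 'rb'),
}
data = {'charset': 'windows-1258'}   # the new optional form field
requests.post('https://example.org/layers/upload', files=files, data=data)
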
25 changes: 11 additions & 14 deletions geonode/upload/utils.py
@@ -523,20 +523,17 @@ def _get_layer_values(layer, upload_session, expand=0):
lyr = inDataSource.GetLayer(str(layer.name))
limit = 100
for feat in islice(lyr, 0, limit):
try:
feat_values = json_loads_byteified(feat.ExportToJson()).get('properties')
for k in feat_values.keys():
type_code = feat.GetFieldDefnRef(k).GetType()
binding = feat.GetFieldDefnRef(k).GetFieldTypeName(type_code)
feat_value = feat_values[k] if str(feat_values[k]) != 'None' else 0
if expand > 0:
ff = {'value': feat_value, 'binding': binding}
feat_values[k] = ff
else:
feat_values[k] = feat_value
layer_values.append(feat_values)
except BaseException:
pass
feat_values = json_loads_byteified(feat.ExportToJson()).get('properties')
for k in feat_values.keys():
type_code = feat.GetFieldDefnRef(k).GetType()
binding = feat.GetFieldDefnRef(k).GetFieldTypeName(type_code)
feat_value = feat_values[k] if str(feat_values[k]) != 'None' else 0
if expand > 0:
ff = {'value': feat_value, 'binding': binding}
feat_values[k] = ff
else:
feat_values[k] = feat_value
layer_values.append(feat_values)
return layer_values


10 changes: 6 additions & 4 deletions geonode/upload/views.py
@@ -52,7 +52,7 @@
from django.shortcuts import get_object_or_404
from django.shortcuts import render
from django.views.generic import CreateView, DeleteView
from geonode.utils import unzip_file
from geonode.utils import fixup_shp_columnnames
from geonode.base.enumerations import CHARSETS

from .forms import (
@@ -171,7 +171,8 @@ def save_step_view(req, session):
scan_hint = get_scan_hint(form.cleaned_data["valid_extensions"])
spatial_files = scan_file(
base_file,
scan_hint=scan_hint
scan_hint=scan_hint,
charset=form.cleaned_data["charset"]
)
logger.info("spatial_files: {}".format(spatial_files))
import_session = save_step(
@@ -191,14 +192,15 @@ def save_step_view(req, session):
)

sld = None

if spatial_files[0].sld_files:
sld = spatial_files[0].sld_files[0]
if not os.path.isfile(os.path.join(tempdir, spatial_files[0].base_file)):
tmp_files = [f for f in os.listdir(tempdir) if os.path.isfile(os.path.join(tempdir, f))]
for f in tmp_files:
if zipfile.is_zipfile(os.path.join(tempdir, f)):
unzip_file(os.path.join(tempdir, f), '.shp', tempdir=tempdir)
fixup_shp_columnnames(os.path.join(tempdir, f),
form.cleaned_data["charset"],
tempdir=tempdir)

_log('provided sld is %s' % sld)
# upload_type = get_upload_type(base_file)
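
In save_step_view() the old unzip_file() call is replaced by fixup_shp_columnnames(), which (per the utils.py changes below) unzips the archive itself when handed a zip and then renames the columns in place. A minimal sketch of that call, assuming the new signature (paths illustrative):

from geonode.utils import fixup_shp_columnnames

# Accepts either a bare .shp or a zip containing one; when a zip is passed the
# archive is extracted into tempdir before the attribute names are renamed.
fixup_shp_columnnames('/tmp/session/upload.zip',
                      'windows-1258',
                      tempdir='/tmp/session')
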
84 changes: 47 additions & 37 deletions geonode/utils.py
@@ -61,7 +61,7 @@
from django.core.serializers.json import DjangoJSONEncoder
from django.utils import timezone

from geonode import geoserver, qgis_server # noqa
from geonode import geoserver, qgis_server, GeoNodeException # noqa

try:
import json
@@ -965,14 +965,21 @@ def check_shp_columnnames(layer):
""" Check if shapefile for a given layer has valid column names.
If not, try to fix column names and warn the user
"""

# TODO we may add in a better location this method
inShapefile = ''
for f in layer.upload_session.layerfile_set.all():
if os.path.splitext(f.file.name)[1] == '.shp':
inShapefile = f.file.path
if inShapefile:
return fixup_shp_columnnames(inShapefile, layer.charset)


def fixup_shp_columnnames(inShapefile, charset, tempdir=None):
""" Try to fix column names and warn the user
"""

tempdir = tempfile.mkdtemp()
if not tempdir:
tempdir = tempfile.mkdtemp()
if is_zipfile(inShapefile):
inShapefile = unzip_file(inShapefile, '.shp', tempdir=tempdir)

@@ -1004,46 +1011,49 @@ def check_shp_columnnames(layer):

if a.match(field_name):
list_col_original.append(field_name)
try:
for i in range(0, inLayerDefn.GetFieldCount()):
charset = layer.charset if layer.charset and 'undefined' not in layer.charset \
else 'UTF-8'
field_name = unicode(
inLayerDefn.GetFieldDefn(i).GetName(),
charset)

if not a.match(field_name):
# once the field_name contains Chinese, to use slugify_zh
has_ch = False
for ch in field_name:
if u'\u4e00' <= ch <= u'\u9fff':

for i in range(0, inLayerDefn.GetFieldCount()):
charset = charset if charset and 'undefined' not in charset \
else 'UTF-8'

field_name = inLayerDefn.GetFieldDefn(i).GetName()
if not a.match(field_name):
# once the field_name contains Chinese, to use slugify_zh
has_ch = False
for ch in field_name:
try:
if u'\u4e00' <= ch.decode("utf-8", "replace") <= u'\u9fff':
has_ch = True
break
if has_ch:
new_field_name = slugify_zh(field_name, separator='_')
else:
new_field_name = custom_slugify(field_name)
if not b.match(new_field_name):
new_field_name = '_' + new_field_name
j = 0
while new_field_name in list_col_original or new_field_name in list_col.values():
if j == 0:
new_field_name += '_0'
if new_field_name.endswith('_' + str(j)):
j += 1
new_field_name = new_field_name[:-2] + '_' + str(j)
list_col.update({field_name: new_field_name})
except UnicodeDecodeError as e:
logger.error(str(e))
return False, None, None
except UnicodeDecodeError:
has_ch = True
break
if has_ch:
new_field_name = slugify_zh(field_name, separator='_')
else:
new_field_name = custom_slugify(field_name)
if not b.match(new_field_name):
new_field_name = '_' + new_field_name
j = 0
while new_field_name in list_col_original or new_field_name in list_col.values():
if j == 0:
new_field_name += '_0'
if new_field_name.endswith('_' + str(j)):
j += 1
new_field_name = new_field_name[:-2] + '_' + str(j)
list_col.update({field_name: new_field_name})

if len(list_col) == 0:
return True, None, None
else:
for key in list_col.keys():
qry = u"ALTER TABLE {0} RENAME COLUMN \"{1}\" TO \"{2}\"".format(
inLayer.GetName(), key, list_col[key])
inDataSource.ExecuteSQL(qry.encode(layer.charset))
try:
for key in list_col.keys():
qry = u"ALTER TABLE {} RENAME COLUMN \"".format(inLayer.GetName())
qry = qry + key.decode(charset) + u"\" TO \"{}\"".format(list_col[key])
inDataSource.ExecuteSQL(qry.encode(charset))
except UnicodeDecodeError:
raise GeoNodeException(
"Could not decode SHAPEFILE attributes by using the specified charset '{}'.".format(charset))
return True, None, list_col


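
fixup_shp_columnnames() returns a 3-tuple whose last element maps each original attribute name to its slugified replacement (or None when nothing needed renaming), and it now raises GeoNodeException when the names cannot be decoded with the supplied charset. A hedged usage sketch (path and encoding illustrative):

from geonode import GeoNodeException
from geonode.utils import fixup_shp_columnnames

try:
    ok, _, renamed = fixup_shp_columnnames('/data/ming_female_1.shp',
                                           charset='windows-1258')
except GeoNodeException:
    # Raised when the attribute names cannot be decoded with the given charset.
    raise
else:
    for old, new in (renamed or {}).items():
        print('{0} -> {1}'.format(old, new))
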
