From d8a39b50c2def94f5eccf7de38007c31bd826666 Mon Sep 17 00:00:00 2001 From: Kipchirchir Sigei Date: Wed, 22 Mar 2023 22:43:52 +0300 Subject: [PATCH 1/4] update inefficient regex Signed-off-by: Kipchirchir Sigei --- onadata/apps/logger/models/xform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onadata/apps/logger/models/xform.py b/onadata/apps/logger/models/xform.py index 3fa1a08281..a423efe7fc 100644 --- a/onadata/apps/logger/models/xform.py +++ b/onadata/apps/logger/models/xform.py @@ -962,7 +962,7 @@ def _set_title(self): # Capture urls within form title if re.search( - r"^(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+$", # noqa + r"^(?:http(s)?:\/\/)?[\w\-.]+(?:\.[\w\-.]+)+[\w\-.~:/?#[\]@!\$&'()*+,;=]+$", # noqa self.title, ): raise XLSFormError(_("Invalid title value; value shouldn't match a URL")) From 5b283c230f391c694821afae15ad4c7a27fb5b86 Mon Sep 17 00:00:00 2001 From: Kipchirchir Sigei Date: Wed, 22 Mar 2023 23:02:46 +0300 Subject: [PATCH 2/4] Use sha256 hashing instead of md5 Signed-off-by: Kipchirchir Sigei --- .../logger/migrations/0001_pre-django-3-upgrade.py | 6 +++--- .../apps/logger/migrations/0051_auto_20180522_1118.py | 4 ++-- onadata/apps/logger/models/attachment.py | 10 +++++----- onadata/apps/logger/models/xform.py | 10 +++++----- onadata/apps/main/models/meta_data.py | 6 +++--- onadata/libs/mixins/etags_mixin.py | 4 ++-- onadata/libs/serializers/xform_serializer.py | 10 +++++----- onadata/libs/utils/export_tools.py | 6 +++--- onadata/libs/utils/gravatar.py | 8 ++++---- 9 files changed, 32 insertions(+), 32 deletions(-) diff --git a/onadata/apps/logger/migrations/0001_pre-django-3-upgrade.py b/onadata/apps/logger/migrations/0001_pre-django-3-upgrade.py index 74bf18b414..b560e8fd38 100644 --- a/onadata/apps/logger/migrations/0001_pre-django-3-upgrade.py +++ b/onadata/apps/logger/migrations/0001_pre-django-3-upgrade.py @@ -14,7 +14,7 @@ from onadata.libs.utils.model_tools import queryset_iterator from onadata.libs.utils.logger_tools import create_xform_version import taggit.managers -from hashlib import md5 +from hashlib import sha256 def recalculate_xform_hash(apps, schema_editor): # pylint: disable=W0613 @@ -29,8 +29,8 @@ def recalculate_xform_hash(apps, schema_editor): # pylint: disable=W0613 counter = 0 for xform in queryset_iterator(xforms, 500): - hash_value = md5(xform.xml.encode("utf8")).hexdigest() - xform.hash = f"md5:{hash_value}" + hash_value = sha256(xform.xml.encode("utf8")).hexdigest() + xform.hash = f"sha256:{hash_value}" xform.save(update_fields=["hash"]) counter += 1 if counter % 500 == 0: diff --git a/onadata/apps/logger/migrations/0051_auto_20180522_1118.py b/onadata/apps/logger/migrations/0051_auto_20180522_1118.py index 7e8067b2bd..8696da884d 100644 --- a/onadata/apps/logger/migrations/0051_auto_20180522_1118.py +++ b/onadata/apps/logger/migrations/0051_auto_20180522_1118.py @@ -6,7 +6,7 @@ from __future__ import unicode_literals from django.db import migrations -from hashlib import md5 +from hashlib import sha256 from onadata.libs.utils.model_tools import queryset_iterator @@ -23,7 +23,7 @@ def recalculate_xform_hash(apps, schema_editor): # pylint: disable=W0613 counter = 0 for xform in queryset_iterator(xforms, 500): - xform.hash = "md5:%s" % md5(xform.xml.encode("utf8")).hexdigest() + xform.hash = "sha256:%s" % sha256(xform.xml.encode("utf8")).hexdigest() xform.save(update_fields=["hash"]) counter += 1 if counter % 500 == 0: diff --git a/onadata/apps/logger/models/attachment.py b/onadata/apps/logger/models/attachment.py index 5a58a186b3..3dbb2c3f01 100644 --- a/onadata/apps/logger/models/attachment.py +++ b/onadata/apps/logger/models/attachment.py @@ -94,13 +94,13 @@ def save(self, *args, **kwargs): @property def file_hash(self): - """Returns the MD5 hash of the file.""" - md5_hash = "" + """Returns the sha256 hash of the file.""" + sha256_hash = "" if self.media_file.storage.exists(self.media_file.name): - md5_hash = hashlib.new( - "md5", self.media_file.read(), usedforsecurity=False + sha256_hash = hashlib.new( + "sha256", self.media_file.read(), usedforsecurity=False ).hexdigest() - return md5_hash + return sha256_hash @property def filename(self): diff --git a/onadata/apps/logger/models/xform.py b/onadata/apps/logger/models/xform.py index a423efe7fc..78242f1a51 100644 --- a/onadata/apps/logger/models/xform.py +++ b/onadata/apps/logger/models/xform.py @@ -970,14 +970,14 @@ def _set_title(self): self.title = title_xml def get_hash(self): - """Returns the MD5 hash of the forms XML content prefixed by 'md5:'""" - md5_hash = hashlib.new( - "md5", self.xml.encode("utf-8"), usedforsecurity=False + """Returns the sha256 hash of the forms XML content prefixed by 'sha256:'""" + sha256_hash = hashlib.new( + "sha256", self.xml.encode("utf-8"), usedforsecurity=False ).hexdigest() - return f"md5:{md5_hash}" + return f"sha256:{sha256_hash}" def set_hash(self): - """Sets the MD5 hash of the form.""" + """Sets the sha256 hash of the form.""" self.hash = self.get_hash() def _set_encrypted_field(self): diff --git a/onadata/apps/main/models/meta_data.py b/onadata/apps/main/models/meta_data.py index 0ac4e936d0..18aff3cc3d 100644 --- a/onadata/apps/main/models/meta_data.py +++ b/onadata/apps/main/models/meta_data.py @@ -222,7 +222,7 @@ def hash(self): def set_hash(self): """ - Returns the md5 hash of the metadata file. + Returns the sha256 hash of the metadata file. """ if not self.data_file: return None @@ -238,9 +238,9 @@ def set_hash(self): return "" else: file_hash = hashlib.new( - "md5", self.data_file.read(), usedforsecurity=False + "sha256", self.data_file.read(), usedforsecurity=False ).hexdigest() - self.file_hash = f"md5:{file_hash}" + self.file_hash = f"sha256:{file_hash}" return self.file_hash diff --git a/onadata/libs/mixins/etags_mixin.py b/onadata/libs/mixins/etags_mixin.py index 144d81384b..9b5684c5df 100644 --- a/onadata/libs/mixins/etags_mixin.py +++ b/onadata/libs/mixins/etags_mixin.py @@ -4,7 +4,7 @@ Adds Etag headers to the viewset response. """ -from hashlib import md5 +from hashlib import sha256 MODELS_WITH_DATE_MODIFIED = ( "XForm", @@ -30,7 +30,7 @@ class ETagsMixin: def set_etag_header(self, etag_value, etag_hash=None): """Updates the response headers with Etag header""" if etag_value: - etag_hash = md5(str(etag_value).encode("utf-8")).hexdigest() + etag_hash = sha256(str(etag_value).encode("utf-8")).hexdigest() if etag_hash: self.headers.update({"ETag": etag_hash}) diff --git a/onadata/libs/serializers/xform_serializer.py b/onadata/libs/serializers/xform_serializer.py index 819c5c19f9..dad8a53db9 100644 --- a/onadata/libs/serializers/xform_serializer.py +++ b/onadata/libs/serializers/xform_serializer.py @@ -650,7 +650,7 @@ def get_url(self, obj): @check_obj def get_hash(self, obj): """ - Returns MD5 hash based on last_submission_time for a media linked form. + Returns sha256 hash based on last_submission_time for a media linked form. """ filename = obj.data_value hsh = obj.file_hash @@ -674,14 +674,14 @@ def get_hash(self, obj): xform = data_view.xform if xform and xform.last_submission_time: - md5_hash = hashlib.new( - "md5", + sha256_hash = hashlib.new( + "sha256", xform.last_submission_time.isoformat().encode("utf-8"), usedforsecurity=False, ).hexdigest() - hsh = f"md5:{md5_hash}" + hsh = f"sha256:{sha256_hash}" - return f"{hsh or 'md5:'}" + return f"{hsh or 'sha256:'}" @check_obj def get_filename(self, obj): diff --git a/onadata/libs/utils/export_tools.py b/onadata/libs/utils/export_tools.py index 0bd4617d14..4c313a87f2 100644 --- a/onadata/libs/utils/export_tools.py +++ b/onadata/libs/utils/export_tools.py @@ -64,11 +64,11 @@ User = get_user_model() -def md5hash(string): +def sha256hash(string): """ - Return the MD5 hex digest of the given string. + Return the sha256 hex digest of the given string. """ - return hashlib.md5(string).hexdigest() + return hashlib.sha256(string).hexdigest() def get_export_options(options): diff --git a/onadata/libs/utils/gravatar.py b/onadata/libs/utils/gravatar.py index f508e32ed0..d85535e2f0 100644 --- a/onadata/libs/utils/gravatar.py +++ b/onadata/libs/utils/gravatar.py @@ -12,10 +12,10 @@ GRAVATAR_SIZE = str(60) -def email_md5(user): +def email_sha256(user): """Returns the hash of an email for the user""" return hashlib.new( - "md5", user.email.lower().encode("utf-8"), usedforsecurity=False + "sha256", user.email.lower().encode("utf-8"), usedforsecurity=False ).hexdigest() @@ -23,7 +23,7 @@ def get_gravatar_img_link(user): """Returns the Gravatar image URL""" return ( GRAVATAR_ENDPOINT - + email_md5(user) + + email_sha256(user) + "?" + urlencode({"d": DEFAULT_GRAVATAR, "s": str(GRAVATAR_SIZE)}) ) @@ -31,5 +31,5 @@ def get_gravatar_img_link(user): def gravatar_exists(user): """Checks if the Gravatar URL exists""" - url = GRAVATAR_ENDPOINT + email_md5(user) + "?" + "d=404" + url = GRAVATAR_ENDPOINT + email_sha256(user) + "?" + "d=404" return requests.get(url).status_code != 404 From 1261365875bbb749436f5969f3cb7ee44c5eac10 Mon Sep 17 00:00:00 2001 From: Kipchirchir Sigei Date: Thu, 23 Mar 2023 09:24:34 +0300 Subject: [PATCH 3/4] Improve URL and file name validation Signed-off-by: Kipchirchir Sigei --- onadata/apps/main/forms.py | 7 +++++++ onadata/libs/utils/export_tools.py | 32 ++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/onadata/apps/main/forms.py b/onadata/apps/main/forms.py index 3b7b7f9b58..99fa79a135 100644 --- a/onadata/apps/main/forms.py +++ b/onadata/apps/main/forms.py @@ -407,6 +407,13 @@ def publish(self, user, id_string=None, created_by=None): if cleaned_url: self.validate(cleaned_url) cleaned_xls_file = urlparse(cleaned_url) + + if cleaned_xls_file.scheme not in ["http", "https"]: + raise forms.ValidationError("Invalid URL scheme") + + if not cleaned_xls_file.netloc: + raise forms.ValidationError("Invalid URL") + cleaned_xls_file = "_".join(cleaned_xls_file.path.split("/")[-2:]) name, extension = os.path.splitext(cleaned_xls_file) diff --git a/onadata/libs/utils/export_tools.py b/onadata/libs/utils/export_tools.py index 4c313a87f2..17cf5aff27 100644 --- a/onadata/libs/utils/export_tools.py +++ b/onadata/libs/utils/export_tools.py @@ -59,6 +59,8 @@ SUPPORTED_INDEX_TAGS = ("[", "]", "(", ")", "{", "}", ".", "_") EXPORT_QUERY_KEY = "query" MAX_RETRIES = 3 +SAFE_FILENAME_REGEX = re.compile(r"^[a-zA-Z0-9_-]{1,100}\.[a-zA-Z0-9]{1,10}$") +MAX_FILENAME_LENGTH = 256 # pylint: disable=invalid-name User = get_user_model() @@ -391,18 +393,26 @@ def increment_index_in_filename(filename): filename should be in the form file.ext or file-2.ext - we check for the dash and index and increment appropriately """ - # check for an index i.e. dash then number then dot extension - regex = re.compile(r"(.+?)\-(\d+)(\..+)") - match = regex.match(filename) - if match: - basename = match.groups()[0] - index = int(match.groups()[1]) + 1 - ext = match.groups()[2] + new_filename = "" + + # Validate the input to prevent DoS attacks + if len(filename) > MAX_FILENAME_LENGTH: + raise Exception("Filename is too long.") + elif not SAFE_FILENAME_REGEX.match(filename): + raise Exception("Filename is not in a safe format.") else: - index = 1 - # split filename from ext - basename, ext = os.path.splitext(filename) - new_filename = f"{basename}-{index}{ext}" + # check for an index i.e. dash then number then dot extension + regex = re.compile(r"(.+?)\-(\d+)(\..+)") + match = regex.match(filename) + if match: + basename = match.groups()[0] + index = int(match.groups()[1]) + 1 + ext = match.groups()[2] + else: + index = 1 + # split filename from ext + basename, ext = os.path.splitext(filename) + new_filename = f"{basename}-{index}{ext}" return new_filename From f40ced49c403c6b6874e810a6969a9a55766543c Mon Sep 17 00:00:00 2001 From: Kipchirchir Sigei Date: Mon, 3 Apr 2023 08:55:58 +0300 Subject: [PATCH 4/4] Update hash fields max length on xform and metadata models Signed-off-by: Kipchirchir Sigei --- .../0005_update_xform_hash_max_length.py | 18 ++++++++++++++++++ onadata/apps/logger/models/xform.py | 2 +- .../0013_update_metadata_hash_max_length.py | 18 ++++++++++++++++++ onadata/apps/main/models/meta_data.py | 2 +- 4 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 onadata/apps/logger/migrations/0005_update_xform_hash_max_length.py create mode 100644 onadata/apps/main/migrations/0013_update_metadata_hash_max_length.py diff --git a/onadata/apps/logger/migrations/0005_update_xform_hash_max_length.py b/onadata/apps/logger/migrations/0005_update_xform_hash_max_length.py new file mode 100644 index 0000000000..7076f9cff8 --- /dev/null +++ b/onadata/apps/logger/migrations/0005_update_xform_hash_max_length.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.18 on 2023-04-02 20:06 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('logger', '0004_update_instance_geoms'), + ] + + operations = [ + migrations.AlterField( + model_name='xform', + name='hash', + field=models.CharField(blank=True, default=None, max_length=128, null=True, verbose_name='Hash'), + ), + ] diff --git a/onadata/apps/logger/models/xform.py b/onadata/apps/logger/models/xform.py index 78242f1a51..81cedbc370 100644 --- a/onadata/apps/logger/models/xform.py +++ b/onadata/apps/logger/models/xform.py @@ -889,7 +889,7 @@ class XForm(XFormMixin, BaseModel): has_hxl_support = models.BooleanField(default=False) last_updated_at = models.DateTimeField(auto_now=True) hash = models.CharField( - _("Hash"), max_length=36, blank=True, null=True, default=None + _("Hash"), max_length=128, blank=True, null=True, default=None ) # XForm was created as a merged dataset is_merged_dataset = models.BooleanField(default=False) diff --git a/onadata/apps/main/migrations/0013_update_metadata_hash_max_length.py b/onadata/apps/main/migrations/0013_update_metadata_hash_max_length.py new file mode 100644 index 0000000000..73fbb32acb --- /dev/null +++ b/onadata/apps/main/migrations/0013_update_metadata_hash_max_length.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.18 on 2023-04-02 20:06 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('main', '0012_metadata_extra_data'), + ] + + operations = [ + migrations.AlterField( + model_name='metadata', + name='file_hash', + field=models.CharField(blank=True, max_length=128, null=True), + ), + ] diff --git a/onadata/apps/main/models/meta_data.py b/onadata/apps/main/models/meta_data.py index 18aff3cc3d..ab2c864940 100644 --- a/onadata/apps/main/models/meta_data.py +++ b/onadata/apps/main/models/meta_data.py @@ -189,7 +189,7 @@ class MetaData(models.Model): extra_data = models.JSONField(default=dict, blank=True, null=True) data_file = models.FileField(upload_to=upload_to, blank=True, null=True) data_file_type = models.CharField(max_length=255, blank=True, null=True) - file_hash = models.CharField(max_length=50, blank=True, null=True) + file_hash = models.CharField(max_length=128, blank=True, null=True) date_created = models.DateTimeField(null=True, auto_now_add=True) date_modified = models.DateTimeField(null=True, auto_now=True) deleted_at = models.DateTimeField(null=True, default=None)