diff --git a/doc/Sphinx/source/User/dataverse-management.rst b/doc/Sphinx/source/User/dataverse-management.rst
index 2ce049e3881..d98872e71a5 100644
--- a/doc/Sphinx/source/User/dataverse-management.rst
+++ b/doc/Sphinx/source/User/dataverse-management.rst
@@ -19,36 +19,40 @@ Creating a dataverse is easy but first you must be a registered user (see Create
 #. Once on the "New Dataverse" page fill in the following fields:
     * Enter the name of your Dataverse.
     * **Host Dataverse**: select which dataverse you would like this new dataverse to belong to. By default it will be a child dataverse of the parent you clicked from.
-    * **Dataverse Alias**: This is an abbreviation, usually lower-case, that becomes part of the URL for the new dataverse. Special characters (~,\`, !, @, #, $, %, ^, &, and \*) and spaces are not allowed. **Note**: if you change the Dataverse Alias field, the URL for your Dataverse changes (http//.../dv/'alias'), which affects links to this page.
+    * **Alias**: This is an abbreviation, usually lower-case, that becomes part of the URL for the new dataverse. Special characters (~,\`, !, @, #, $, %, ^, &, and \*) and spaces are not allowed. **Note**: if you change the Alias field, the URL for your Dataverse changes (http://.../dv/'alias'), which affects links to this page.
     * **E-mail**: This is the email address you will receive notifications for this particular Dataverse.
     * **Affiliation**: Add any Affiliation that can be associated to this particular dataverse (e.g., project name, institute name, department name, journal name, etc).
     * **Description**: Provide a description of this dataverse (max. 1000 characters). This will display on the home page of your dataverse and in the search result list.
     * **Choose the sets of Metadata Elements for datasets in this Dataverse**: by default the metadata elements will be from the host dataverse that this new dataverse is created in.
 3. Click "Create Dataverse" button and you're done! An email will be sent to you with more information, including the URL to access your new dataverse.
 
-\*Required information can vary depending on site policy. Required fields are noted with a red asterisk.
+\*Required fields are denoted by a red asterisk.
 
 Edit Dataverse
 =================
 
-To edit your Dataverse, navigate to your Dataverse homepage and select the "Edit Dataverse" button.
+To edit your Dataverse, navigate to your Dataverse homepage and select the "Edit Dataverse" button,
+where you will be presented with the following editing options.
 
-- **General Information**: edit name, host dataverse, alias, email, description, and affilitation for your dataverse.
+- **General Information**: edit name, host dataverse, alias, email,
+  description, affiliation, and Metadata Elements for your dataverse.
 - **Roles + Permissions** for this particular dataverse
-- **Setup**: Edit the Metadata elements and Facets you want to associate with your dataverse. Note: facets will appear in the order shown on the list.
+- **Setup**: Update the Facets you want to associate with your dataverse.
+  Note: facets will appear in the order shown on the list.
 
 Publish Your Dataverse
 =================================================================
 
-Once your dataverse is ready to go public, go to your dataverse page, click on the "Private" button on the right
+Once your dataverse is ready to go public, go to your dataverse page, click on the "Unpublished" button on the right
 hand side of the page which should indicate:
-"This dataverse is Private. 
To make it public click 'Publish dataverse' link." +"This dataverse is Unpublished. To publish it click 'Publish dataverse' link." Once you click "Publish dataverse" it +will be made public. -**Important Note**: Once a dataverse is made public it can no longer be un-published. +**Important Note**: Once a dataverse is made public it can no longer be unpublished. -.. |image1| image:: ./img/Dataverses-Datasets.png +.. |image1| image:: ./img/Dataverse-Diagram.png diff --git a/doc/Sphinx/source/User/img/Dataverse-Diagram.png b/doc/Sphinx/source/User/img/Dataverse-Diagram.png new file mode 100644 index 00000000000..77c12c7ce44 Binary files /dev/null and b/doc/Sphinx/source/User/img/Dataverse-Diagram.png differ diff --git a/pom.xml b/pom.xml index c7b4f8ec1d5..dfda62dd6ff 100644 --- a/pom.xml +++ b/pom.xml @@ -20,6 +20,13 @@ PrimeFaces Maven Repository http://repository.primefaces.org default + + + + geotk-repo + Geo Toolkit Maven Repository + http://maven.geotoolkit.org + default central-repo @@ -66,7 +73,6 @@ 7.0 provided - org.primefaces primefaces @@ -169,6 +175,23 @@ jhove-handler 1.11.0 + + + javax.media + jai_imageio + 1.1.1 + + + javax.media + jai_core + 1.1.3 + + + javax.media + jai_codec + 1.1.3 + + diff --git a/scripts/installer/install b/scripts/installer/install index 4acced60f72..d5e1cef080a 100755 --- a/scripts/installer/install +++ b/scripts/installer/install @@ -827,7 +827,7 @@ print "FOR EXAMPLE, IF A CONFIGURATION SETTING THAT WE ARE TRYING\n"; print "TO CREATE ALREADY EXISTS; OR IF A JVM OPTION THAT WE ARE\n"; print "DELETING DOESN'T. THESE \"FAILURES\" ARE NORMAL!\n"; print "*********************\n\n"; -print "When asadmin asks you to \"Enter admin user name\",\n"; +print "When/if asadmin asks you to \"Enter admin user name\",\n"; print "it should be safe to hit return and accept the default\n"; print "(which is \"admin\").\n"; @@ -838,6 +838,20 @@ system "stty cbreak /dev/tty 2>&1"; system "stty -cbreak /dev/tty 2>&1"; print "\n"; +# start domain, if not running: + +my $DOMAIN_DOWN=`$CONFIG_DEFAULTS{'GLASSFISH_DIRECTORY'}/bin/asadmin list-domains | grep "$DOMAIN " | grep "not running"`; +print STDERR $DOMAIN_DOWN . "\n"; +if ($DOMAIN_DOWN) +{ + print "Trying to start domain up...\n"; + system ($CONFIG_DEFAULTS{'GLASSFISH_DIRECTORY'}."/bin/asadmin start-domain domain1"); +} +else +{ + print "domain appears to be up...\n"; +} + # create asadmin login, so that the user doesn't have to enter # the username and password for every asadmin command, if # access to :4848 is password-protected: diff --git a/scripts/search/tests/dataset-versioning04 b/scripts/search/tests/dataset-versioning04 index 9322f739a30..5ae34c7248c 100755 --- a/scripts/search/tests/dataset-versioning04 +++ b/scripts/search/tests/dataset-versioning04 @@ -1,18 +1,24 @@ #!/bin/bash # We assume you've done everything in scripts/search/tests/dataset-versioning03 -# There should now be two Solr documents with different titles: +# There should now be two Solr documents for the same dataset with different titles: # +# Version 1.0 has the published title and a single published file # Title: Rings of Trees and Other Observations -# to +# files: 18:trees.png +# +# The version that is post 1.0 has a different title # Title: Rings of Conifers and Other Observations +# files: 18:trees.png +# +# Now let's add a second file to the unpublished version of the study: +# +# http://localhost:8080/dataset.xhtml?id=17 +# +# The new file should be named trees2.png and have a description of +# "Another tree image." 
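+#
+# (Editor's note, illustrative and based on the expected output file
+# scripts/search/tests/expected/dataset-versioning04pete added below: once
+# trees2.png is uploaded and the dataset is re-indexed, pete's search for
+# "trees" should return 7 items, including datafile_18:trees.png,
+# datafile_19:trees2.png, the published dataset_17 card and the
+# dataset_17_draft card.)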
# -# anon should be able to see the published 1.0 version but not the new draft (no change from dataset-versioning02anon) -#diff -u <(curl -s 'http://localhost:8080/api/search?q=trees&showrelevance=true') scripts/search/tests/expected/dataset-versioning02anon -# pete should be able to see the published version 1.0 with published=true (same as anon) -#diff -u <(curl -s 'http://localhost:8080/api/search?q=trees&key=pete&published=true') scripts/search/tests/expected/dataset-versioning03pete-published-only -# pete should be able to see the newer draft version with unpublished=true -#diff -u <(curl -s 'http://localhost:8080/api/search?q=trees&key=pete&unpublished=true') scripts/search/tests/expected/dataset-versioning03pete-unpublished-only -# pete should see just the published version by default -diff -u <(curl -s 'http://localhost:8080/api/search?q=trees&key=pete') scripts/search/tests/expected/dataset-versioning03pete-published-only -# here's the solr doc for the dataset -#diff -u scripts/search/tests/expected/dataset-versioning02dataset_17solr <(curl -s 'http://localhost:8983/solr/collection1/select?rows=100&wt=json&indent=true&q=id:dataset_17') | egrep -v '_version_|release_or_create_date_dt' +# anon should be able to see the published 1.0 version but not the new draft and not the new file +# (no change from dataset-versioning02anon) +diff -u <(curl -s 'http://localhost:8080/api/search?q=trees&showrelevance=true') scripts/search/tests/expected/dataset-versioning02anon +# pete should be able to see the new unpublished file +diff -u <(curl -s 'http://localhost:8080/api/search?q=trees&key=pete') scripts/search/tests/expected/dataset-versioning04pete diff --git a/scripts/search/tests/dataset-versioning05 b/scripts/search/tests/dataset-versioning05 new file mode 100755 index 00000000000..ff8bc9a52d5 --- /dev/null +++ b/scripts/search/tests/dataset-versioning05 @@ -0,0 +1,24 @@ +#!/bin/bash +# We assume you've done everything in scripts/search/tests/dataset-versioning04 +# There should now be two Solr documents for datasets with different titles. +# +# Version 1.0 has a single file +# Title: Rings of Trees and Other Observations +# files: 18:trees.png +# +# The version that is post 1.0 but unpublished has two files +# Title: Rings of Conifers and Other Observations +# files: 18:trees.png, 19:trees2.png +# +# Now let's change the description of the published file (18:trees.png) from +# "Trees are lovely." +# to +# "The first picture of trees I uploaded." +# +# anon should be able to see the published 1.0 version but not the new draft and not the new file +# and not the change in description +# (no change from dataset-versioning02anon) +diff -u <(curl -s 'http://localhost:8080/api/search?q=trees&showrelevance=true') scripts/search/tests/expected/dataset-versioning02anon +# What about pete? should he see multiple cards for the two versions +# (with different descriptions) of 18:trees.png? Right now there is only one +# card per file and for published files it always shows the published information. 
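Editor's note: the two scripts above drive these checks with curl and diff against the expected JSON files. As a rough sketch only (not part of this patch; the endpoint, the key=pete parameter, and the "total_count" field are taken from the scripts and expected files in this change, and it assumes a javax.json / JSON-P implementation on the classpath, which the application code below already uses), the same smoke check could be done from Java:

    import java.io.InputStream;
    import java.net.URL;
    import javax.json.Json;
    import javax.json.JsonObject;
    import javax.json.JsonReader;

    public class SearchSmokeTest {
        public static void main(String[] args) throws Exception {
            // Query the search API as the user "pete", as the test scripts above do.
            URL url = new URL("http://localhost:8080/api/search?q=trees&key=pete");
            try (InputStream in = url.openStream();
                 JsonReader reader = Json.createReader(in)) {
                JsonObject response = reader.readObject();
                // dataset-versioning04pete expects a total_count of 7 for this query.
                int totalCount = response.getInt("total_count");
                System.out.println("total_count = " + totalCount);
                if (totalCount != 7) {
                    System.err.println("unexpected total_count, expected 7");
                }
            }
        }
    }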
diff --git a/scripts/search/tests/expected/dataset-versioning02anon b/scripts/search/tests/expected/dataset-versioning02anon index 83ceaaf248a..811cd343f02 100644 --- a/scripts/search/tests/expected/dataset-versioning02anon +++ b/scripts/search/tests/expected/dataset-versioning02anon @@ -26,7 +26,7 @@ }, { "id":"dataset_17", - "matched_fields":"[dsDescription, title, citation_t, notesText, authorAffiliation, authorName, keyword, contributorName]", + "matched_fields":"[dsDescription, title, notesText, authorAffiliation, authorName, keyword, contributorName]", "detailsArray":[ { "dsDescription":[ @@ -38,11 +38,6 @@ "Rings of Trees and Other Observations" ] }, - { - "citation_t":[ - "Tree, Tony, 2014, \"Rings of Trees and Other Observations\", http://dx.doi.org/10.5072/FK2/17, Root" - ] - }, { "notesText":[ "Many notes have been taken about trees over the years." diff --git a/scripts/search/tests/expected/dataset-versioning04pete b/scripts/search/tests/expected/dataset-versioning04pete new file mode 100644 index 00000000000..0b718de71a4 --- /dev/null +++ b/scripts/search/tests/expected/dataset-versioning04pete @@ -0,0 +1,10 @@ + +{ + "q":"trees", + "fq_provided":"[]", + "fq_actual":"[({!join from=groups_s to=perms_ss}id:group_public OR {!join from=groups_s to=perms_ss}id:group_user1)]", + "total_count":7, + "start":0, + "count_in_response":7, + "items":"[datafile_18:trees.png:18, datafile_19:trees2.png:19, dataset_17:Rings of Trees and Other Observations:17, dataset_17_draft:Rings of Conifers and Other Observations:17, dataverse_10:Birds:10, dataverse_11:Trees:11, dataverse_16:Chestnut Trees:16]" +} \ No newline at end of file diff --git a/scripts/search/tests/expected/highlighting-nick-trees b/scripts/search/tests/expected/highlighting-nick-trees index 5f235e02ab2..87091ece143 100644 --- a/scripts/search/tests/expected/highlighting-nick-trees +++ b/scripts/search/tests/expected/highlighting-nick-trees @@ -26,7 +26,7 @@ }, { "id":"dataset_17_draft", - "matched_fields":"[dsDescription, title, citation_t, notesText, authorAffiliation, authorName, keyword, contributorName]", + "matched_fields":"[dsDescription, title, notesText, authorAffiliation, authorName, keyword, contributorName]", "detailsArray":[ { "dsDescription":[ @@ -38,11 +38,6 @@ "Rings of Trees and Other Observations" ] }, - { - "citation_t":[ - "Tree, Tony, 2014, \"Rings of Trees and Other Observations\", http://dx.doi.org/10.5072/FK2/17, Root" - ] - }, { "notesText":[ "Many notes have been taken about trees over the years." diff --git a/scripts/search/tests/expected/highlighting-pete-trees b/scripts/search/tests/expected/highlighting-pete-trees index 2fa343fc37c..9564910dd0a 100644 --- a/scripts/search/tests/expected/highlighting-pete-trees +++ b/scripts/search/tests/expected/highlighting-pete-trees @@ -26,7 +26,7 @@ }, { "id":"dataset_17_draft", - "matched_fields":"[dsDescription, title, citation_t, notesText, authorAffiliation, authorName, keyword, contributorName]", + "matched_fields":"[dsDescription, title, notesText, authorAffiliation, authorName, keyword, contributorName]", "detailsArray":[ { "dsDescription":[ @@ -38,11 +38,6 @@ "Rings of Trees and Other Observations" ] }, - { - "citation_t":[ - "Tree, Tony, 2014, \"Rings of Trees and Other Observations\", http://dx.doi.org/10.5072/FK2/17, Root" - ] - }, { "notesText":[ "Many notes have been taken about trees over the years." 
diff --git a/scripts/search/tests/highlighting b/scripts/search/tests/highlighting index da57ff28a4a..bf01ee1e6c1 100755 --- a/scripts/search/tests/highlighting +++ b/scripts/search/tests/highlighting @@ -28,11 +28,6 @@ # Type: Data Collector # Name: Edward Trees Jr. # -# Until https://redmine.hmdc.harvard.edu/issues/3778 is complete: -# Add the following -# Production Date: 2014-05-02 -# (Otherwise you won't be able to publish your dataset.) -# # We assume you add a file called "trees.png" to this dataset # with a description of "Trees are lovely." # diff --git a/src/main/java/edu/harvard/iq/dataverse/AdvancedSearchPage.java b/src/main/java/edu/harvard/iq/dataverse/AdvancedSearchPage.java index b4a46b5549b..e897611d3cc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/AdvancedSearchPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/AdvancedSearchPage.java @@ -128,40 +128,78 @@ public String find() throws IOException { * see also https://redmine.hmdc.harvard.edu/issues/3745 */ if (!dvFieldName.isEmpty()) { - queryBuilder = new StringBuilder(); - queryBuilder.append(SearchFields.DATAVERSE_NAME + ":" + dvFieldName); + queryBuilder = constructQuery(SearchFields.DATAVERSE_NAME, dvFieldName); } if (!dvFieldAffiliation.isEmpty()) { - queryBuilder = new StringBuilder(); - queryBuilder.append(SearchFields.DATAVERSE_AFFILIATION + ":" + dvFieldAffiliation); + queryBuilder = constructQuery(SearchFields.DATAVERSE_AFFILIATION, dvFieldAffiliation); } if (!dvFieldDescription.isEmpty()) { - queryBuilder = new StringBuilder(); - queryBuilder.append(SearchFields.DATAVERSE_DESCRIPTION + ":" + dvFieldDescription); + queryBuilder = constructQuery(SearchFields.DATAVERSE_DESCRIPTION, dvFieldDescription); } if (!fileFieldName.isEmpty()) { - queryBuilder = new StringBuilder(); - queryBuilder.append(SearchFields.FILE_NAME + ":" + fileFieldName); + queryBuilder = constructQuery(SearchFields.FILE_NAME, fileFieldName); } if (!fileFieldDescription.isEmpty()) { - queryBuilder = new StringBuilder(); - queryBuilder.append(SearchFields.FILE_DESCRIPTION + ":" + fileFieldDescription); + queryBuilder = constructQuery(SearchFields.FILE_DESCRIPTION, fileFieldDescription); } if (!fileFieldFiletype.isEmpty()) { - queryBuilder = new StringBuilder(); - queryBuilder.append(SearchFields.FILE_TYPE_SEARCHABLE + ":" + fileFieldFiletype); + queryBuilder = constructQuery(SearchFields.FILE_TYPE_SEARCHABLE, fileFieldFiletype); } } return "/dataverse.xhtml?q=" + queryBuilder.toString().trim() + "faces-redirect=true"; } - + + + /** + * @todo have the code that operates on dataset fields call into this? 
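+     *
+     * Illustrative example (editor's note, not in the original patch): for a
+     * plain query such as
+     *     tree rings
+     * this builds (<solrField>:tree <solrField>:rings); once the user query
+     * contains any double quotes, e.g.
+     *     "tree rings" conifer
+     * every token is emitted as a quoted phrase, giving
+     *     (<solrField>:"tree rings" <solrField>:"conifer")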
+ */ + private StringBuilder constructQuery(String solrField, String userSuppliedQuery) { + + StringBuilder queryBuilder = new StringBuilder(); + String delimiter = "[\"]+"; + + List queryStrings = new ArrayList(); + + if (userSuppliedQuery != null && !userSuppliedQuery.equals("")) { + if (userSuppliedQuery.contains("\"")) { + String[] tempString = userSuppliedQuery.split(delimiter); + for (int i = 1; i < tempString.length; i++) { + if (!tempString[i].equals(" ") && !tempString[i].isEmpty()) { + queryStrings.add(solrField + ":" + "\"" + tempString[i].trim() + "\""); + } + } + } else { + StringTokenizer st = new StringTokenizer(userSuppliedQuery); + while (st.hasMoreElements()) { + queryStrings.add(solrField + ":" + st.nextElement()); + } + } + } + + if (queryStrings.size() > 1) { + queryBuilder.append("("); + } + + for (int i = 0; i < queryStrings.size(); i++) { + if (i > 0) { + queryBuilder.append(" "); + } + queryBuilder.append(queryStrings.get(i)); + } + + if (queryStrings.size() > 1) { + queryBuilder.append(")"); + } + + return queryBuilder; + } public Dataverse getDataverse() { return dataverse; diff --git a/src/main/java/edu/harvard/iq/dataverse/ControlledVocabularyValueConverter.java b/src/main/java/edu/harvard/iq/dataverse/ControlledVocabularyValueConverter.java index e46fc259f64..6be05d1024c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ControlledVocabularyValueConverter.java +++ b/src/main/java/edu/harvard/iq/dataverse/ControlledVocabularyValueConverter.java @@ -23,8 +23,12 @@ public class ControlledVocabularyValueConverter implements Converter { DatasetFieldServiceBean datasetFieldService; public Object getAsObject(FacesContext facesContext, UIComponent component, String submittedValue) { - ControlledVocabularyValue cvv = datasetFieldService.findControlledVocabularyValue(new Long(submittedValue)); - return cvv; + if (submittedValue == null || submittedValue.equals("")) { + return ""; + } else { + ControlledVocabularyValue cvv = datasetFieldService.findControlledVocabularyValue(new Long(submittedValue)); + return cvv; + } } public String getAsString(FacesContext facesContext, UIComponent component, Object value) { diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFile.java b/src/main/java/edu/harvard/iq/dataverse/DataFile.java index c38a0e42ce1..209fe3f0000 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFile.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFile.java @@ -290,7 +290,9 @@ public boolean isImage() { if ("image/fits".equalsIgnoreCase(contentType)) { return false; } - return (contentType != null && contentType.startsWith("image/")); + // a pdf file is an "image" for practical purposes (we will attempt to + // generate thumbnails and previews for them) + return (contentType != null && (contentType.startsWith("image/") || contentType.equalsIgnoreCase("application/pdf"))); } public boolean isIngestScheduled() { diff --git a/src/main/java/edu/harvard/iq/dataverse/Dataset.java b/src/main/java/edu/harvard/iq/dataverse/Dataset.java index cc459605175..d6b09208acc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/Dataset.java +++ b/src/main/java/edu/harvard/iq/dataverse/Dataset.java @@ -187,6 +187,10 @@ public Path getFileSystemDirectory() { public String getCitation() { return getCitation(false, getLatestVersion()); } + + public String getCitation(DatasetVersion version){ + return version.getCitation(); + } public String getCitation(boolean isOnlineVersion, DatasetVersion version) { return version.getCitation(isOnlineVersion); diff --git 
a/src/main/java/edu/harvard/iq/dataverse/DatasetField.java b/src/main/java/edu/harvard/iq/dataverse/DatasetField.java index 9f798a8fe1d..8b2397ccd8e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetField.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetField.java @@ -27,9 +27,11 @@ import javax.persistence.ManyToOne; import javax.persistence.OneToMany; import javax.persistence.OrderBy; +import javax.persistence.Transient; import org.apache.commons.lang.StringUtils; @Entity +@ValidateDatasetFieldType public class DatasetField implements Serializable { private static final long serialVersionUID = 1L; @@ -49,8 +51,10 @@ public static DatasetField createNewEmptyDatasetField(DatasetFieldType dsfType, dsfv.setDatasetVersion(dsv); return dsfv; } - - public static DatasetField createNewEmptyDatasetField(DatasetFieldType dsfType, DatasetFieldCompoundValue compoundValue) { + + // originally this was an overloaded method, but we renamed it to get around an issue with Bean Validation + // (that looked t overloaded methods, when it meant to look at overriden methods + public static DatasetField createNewEmptyChildDatasetField(DatasetFieldType dsfType, DatasetFieldCompoundValue compoundValue) { DatasetField dsfv = createNewEmptyDatasetField(dsfType); dsfv.setParentDatasetFieldCompoundValue(compoundValue); return dsfv; @@ -213,7 +217,7 @@ public String getDisplayValue() { } return returnString; } - + public List getValues() { List returnList = new ArrayList(); if (!datasetFieldValues.isEmpty()) { @@ -222,7 +226,9 @@ public List getValues() { } } else { for (ControlledVocabularyValue cvv : controlledVocabularyValues) { - returnList.add(cvv.getStrValue()); + if (cvv != null && cvv.getStrValue() != null){ + returnList.add(cvv.getStrValue()); + } } } return returnList; @@ -247,6 +253,18 @@ public boolean isEmpty() { return true; } + + + @Transient private String validationMessage; + + public String getValidationMessage() { + return validationMessage; + } + + public void setValidationMessage(String validationMessage) { + this.validationMessage = validationMessage; + } + @Override public int hashCode() { @@ -277,7 +295,9 @@ public DatasetField copy(DatasetVersion version) { return copy(version, null); } - public DatasetField copy(DatasetFieldCompoundValue parent) { + // originally this was an overloaded method, but we renamed it to get around an issue with Bean Validation + // (that looked t overloaded methods, when it meant to look at overriden methods + public DatasetField copyChild(DatasetFieldCompoundValue parent) { return copy(null, parent); } diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldCompoundValue.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldCompoundValue.java index 06780fa9a52..fc0a8bc85d8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldCompoundValue.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldCompoundValue.java @@ -38,7 +38,7 @@ public static DatasetFieldCompoundValue createNewEmptyDatasetFieldCompoundValue( compoundValue.setParentDatasetField(dsf); for (DatasetFieldType dsfType : dsf.getDatasetFieldType().getChildDatasetFieldTypes()) { - compoundValue.getChildDatasetFields().add( DatasetField.createNewEmptyDatasetField(dsfType, compoundValue)); + compoundValue.getChildDatasetFields().add( DatasetField.createNewEmptyChildDatasetField(dsfType, compoundValue)); } return compoundValue; @@ -122,7 +122,7 @@ public DatasetFieldCompoundValue copy(DatasetField parent) { compoundValue.setDisplayOrder(displayOrder); for 
(DatasetField subField : childDatasetFields) { - compoundValue.getChildDatasetFields().add(subField.copy(compoundValue)); + compoundValue.getChildDatasetFields().add(subField.copyChild(compoundValue)); } return compoundValue; diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldType.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldType.java index 39a29a2a871..5ac89a93b52 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldType.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldType.java @@ -186,7 +186,7 @@ public ControlledVocabularyValue getControlledVocabularyValue( String strValue ) throw new IllegalStateException("getControlledVocabularyValue() called on a non-controlled vocabulary type."); } if ( controlledVocabularyValuesByStrValue == null ) { - controlledVocabularyValuesByStrValue = new TreeMap<>(); + controlledVocabularyValuesByStrValue = new TreeMap<>(); for ( ControlledVocabularyValue cvv : getControlledVocabularyValues() ) { controlledVocabularyValuesByStrValue.put( cvv.getStrValue(), cvv); } diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldTypeValidator.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldTypeValidator.java deleted file mode 100644 index 36581be3341..00000000000 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldTypeValidator.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. - */ -package edu.harvard.iq.dataverse; - -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.Date; -import javax.validation.ConstraintValidator; -import javax.validation.ConstraintValidatorContext; -import org.apache.commons.lang.StringUtils; - -/** - * - * @author gdurand - */ -public class DatasetFieldTypeValidator implements ConstraintValidator { - - //private String fieldType; - public void initialize(ValidateDatasetFieldType constraintAnnotation) { - //this.fieldType = constraintAnnotation.value(); - } - - public boolean isValid(DatasetFieldValue value, ConstraintValidatorContext context) { - - context.disableDefaultConstraintViolation(); // we do this so we can have different messages depending on the different issue - - - DatasetFieldType dsfType = value.getDatasetField().getDatasetFieldType(); - String fieldType = dsfType.getFieldType(); - - - if (dsfType.isRequired() && !dsfType.isControlledVocabulary() && StringUtils.isBlank(value.getValue())) { - context.buildConstraintViolationWithTemplate(dsfType.getDisplayName() + " is required.").addConstraintViolation(); - return false; - } - - if (fieldType.equals("date")) { - SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd"); - try { - format.parse(value.getValue()); - } catch (Exception e) { - context.buildConstraintViolationWithTemplate(dsfType.getDisplayName() + " is not a valid date.").addConstraintViolation(); - return false; - } - - } - return true; - } - -} diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValidator.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValidator.java new file mode 100644 index 00000000000..4eba785b520 --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValidator.java @@ -0,0 +1,37 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ +package edu.harvard.iq.dataverse; + +import javax.validation.ConstraintValidator; +import javax.validation.ConstraintValidatorContext; +import org.apache.commons.lang.StringUtils; + + +/** + * + * @author gdurand + */ +public class DatasetFieldValidator implements ConstraintValidator { + + @Override + public void initialize(ValidateDatasetFieldType constraintAnnotation) { + } + + @Override + public boolean isValid(DatasetField value, ConstraintValidatorContext context) { + context.disableDefaultConstraintViolation(); // we do this so we can have different messages depending on the different issue + + DatasetFieldType dsfType = value.getDatasetFieldType(); + + if (dsfType.isPrimitive() && dsfType.isRequired() && StringUtils.isBlank(value.getValue())) { + context.buildConstraintViolationWithTemplate(dsfType.getDisplayName() + " is required.").addConstraintViolation(); + return false; + } + + return true; + } + +} diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValue.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValue.java index ab744105a83..38f13389c7b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValue.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValue.java @@ -15,6 +15,7 @@ import javax.persistence.Id; import javax.persistence.JoinColumn; import javax.persistence.ManyToOne; +import javax.persistence.Transient; /** * @@ -88,6 +89,17 @@ public void setDatasetField(DatasetField datasetField) { this.datasetField = datasetField; } + @Transient private String validationMessage; + + public String getValidationMessage() { + return validationMessage; + } + + public void setValidationMessage(String validationMessage) { + this.validationMessage = validationMessage; + } + + @Override diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValueValidator.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValueValidator.java new file mode 100644 index 00000000000..73dccc3751e --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValueValidator.java @@ -0,0 +1,138 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ +package edu.harvard.iq.dataverse; + +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLConnection; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.Date; +import java.util.GregorianCalendar; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import javax.validation.ConstraintValidator; +import javax.validation.ConstraintValidatorContext; +import org.apache.commons.lang.StringUtils; + +/** + * + * @author gdurand + */ +public class DatasetFieldValueValidator implements ConstraintValidator { + + //private String fieldType; + public void initialize(ValidateDatasetFieldType constraintAnnotation) { + //this.fieldType = constraintAnnotation.value(); + } + + public boolean isValid(DatasetFieldValue value, ConstraintValidatorContext context) { + + context.disableDefaultConstraintViolation(); // we do this so we can have different messages depending on the different issue + + + DatasetFieldType dsfType = value.getDatasetField().getDatasetFieldType(); + String fieldType = dsfType.getFieldType(); + + + if (StringUtils.isBlank(value.getValue())) { + return true; + } + + if (fieldType.equals("date")) { + boolean valid = false; + if (!valid) { + valid = isValidDate(value.getValue(), "yyyy-MM-dd"); + } + if (!valid) { + valid = isValidDate(value.getValue(), "yyyy-MM"); + } + if (!valid ) { + valid = isValidDate(value.getValue(), "yyyy"); + } + if (!valid ) { + context.buildConstraintViolationWithTemplate(" " + dsfType.getDisplayName() + " is not a valid date.").addConstraintViolation(); + return false; + } + } + + if (fieldType.equals("float")) { + try { + Double.parseDouble(value.getValue()); + } catch (Exception e) { + context.buildConstraintViolationWithTemplate(" " + dsfType.getDisplayName() + " is not a valid number.").addConstraintViolation(); + return false; + } + } + + if (fieldType.equals("int")) { + try { + Integer.parseInt(value.getValue()); + } catch (Exception e) { + context.buildConstraintViolationWithTemplate(" " + dsfType.getDisplayName() + " is not a valid integer.").addConstraintViolation(); + return false; + } + } + + if (fieldType.equals("text") && value.getValue().length() > 255) { + context.buildConstraintViolationWithTemplate(" " + dsfType.getDisplayName() + " may not be more than 255 characters.").addConstraintViolation(); + return false; + } + + if (fieldType.equals("textbox") && value.getValue().length() > 1000) { + context.buildConstraintViolationWithTemplate(" " + dsfType.getDisplayName() + " may not be more than 1000 characters.").addConstraintViolation(); + return false; + } + if (fieldType.equals("url")) { + try { + URL url = new URL(value.getValue()); + } catch (MalformedURLException e) { + context.buildConstraintViolationWithTemplate(" " + dsfType.getDisplayName() + " is not a valid URL.").addConstraintViolation(); + return false; + } + } + + if (fieldType.equals("email")) { + Pattern p = Pattern.compile(".+@.+\\.[a-z]+"); + Matcher m = p.matcher(value.getValue()); + boolean matchFound = m.matches(); + if (!matchFound) { + context.buildConstraintViolationWithTemplate(" " + dsfType.getDisplayName() + " is not a valid email address.").addConstraintViolation(); + return false; + } + } + + return true; + } + + private boolean isValidDate(String dateString, String pattern) { + boolean valid=true; + Date date; + SimpleDateFormat sdf = new SimpleDateFormat(pattern); + sdf.setLenient(false); + try { + dateString = dateString.trim(); + date = 
sdf.parse(dateString); + Calendar calendar = Calendar.getInstance(); + calendar.setTime(date); + int year = calendar.get(Calendar.YEAR); + int era = calendar.get(Calendar.ERA); + if (era == GregorianCalendar.AD ) { + if ( year > 9999) { + valid=false; + } + } + }catch (ParseException e) { + valid=false; + } + if (dateString.length()>pattern.length()) { + valid=false; + } + return valid; + } + +} diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 0c4f6854f44..6e59183392d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -26,8 +26,8 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; -import java.util.Iterator; import java.util.List; +import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import javax.ejb.EJB; @@ -44,6 +44,10 @@ import javax.json.JsonObject; import javax.json.JsonArray; import javax.json.JsonReader; +import javax.validation.ConstraintViolation; +import javax.validation.Validation; +import javax.validation.Validator; +import javax.validation.ValidatorFactory; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.methods.GetMethod; @@ -74,7 +78,7 @@ public enum DisplayMode { @EJB DataFileServiceBean datafileService; @EJB - PermissionServiceBean permissionServiceBean; + PermissionServiceBean permissionServiceBean; @EJB DataverseServiceBean dataverseService; @EJB @@ -201,44 +205,30 @@ public void init() { if (dataset.getId() != null) { // view mode for a dataset dataset = datasetService.find(dataset.getId()); if (versionId == null) { - if (canIssueUpdateCommand()){ - displayVersion = dataset.getLatestVersion(); + if (!dataset.isReleased()) { + displayVersion = dataset.getLatestVersion(); } else { - displayVersion = dataset.getReleasedVersion(); - } + displayVersion = dataset.getReleasedVersion(); + } } else { displayVersion = datasetVersionService.find(versionId); - } + } ownerId = dataset.getOwner().getId(); - //displayVersion.setDatasetFields(displayVersion.initDatasetFields()); if (dataset.getReleasedVersion() != null) { datasetNextMajorVersion = new Integer(dataset.getReleasedVersion().getVersionNumber().intValue() + 1).toString() + ".0"; datasetNextMinorVersion = new Integer(dataset.getReleasedVersion().getVersionNumber().intValue()).toString() + "." 
+ new Integer(dataset.getReleasedVersion().getMinorVersionNumber().intValue() + 1).toString(); } - - - /* - if (!dataset.isReleased() || (dataset.isReleased() && displayVersion.equals(dataset.getLatestVersion()) && !displayVersion.isDraft())) { - displayCitation = dataset.getCitation(false, displayVersion); - } else if (dataset.isReleased() && displayVersion.isDraft()) { - displayCitation = dataset.getCitation(false, displayVersion.getMostRecentlyReleasedVersion()); - } else if (dataset.isReleased() && !displayVersion.equals(dataset.getLatestVersion())) { - displayCitation = dataset.getCitation(false, displayVersion.getLargestMinorRelease()); - } else { - displayCitation = ""; - } - */ - // show citation for current display version if draft note it on page + try { - datasetVersionUI = new DatasetVersionUI(displayVersion); - displayCitation = dataset.getCitation(false, displayVersion); - } catch (NullPointerException npe){ + datasetVersionUI = new DatasetVersionUI(displayVersion); + } catch (NullPointerException npe) { //This will happen when solr is down and will allow any link to be displayed. throw new RuntimeException("You do not have permission to view this dataset version."); // improve error handling } - + + displayCitation = dataset.getCitation(false, displayVersion); setVersionTabList(resetVersionTabList()); } else if (ownerId != null) { @@ -348,6 +338,35 @@ public void refresh(ActionEvent e) { } public String save() { + + // Validate + boolean dontSave = false; + ValidatorFactory factory = Validation.buildDefaultValidatorFactory(); + Validator validator = factory.getValidator(); + for (DatasetField dsf : editVersion.getFlatDatasetFields()) { + dsf.setValidationMessage(null); // clear out any existing validation message + Set> constraintViolations = validator.validate(dsf); + for (ConstraintViolation constraintViolation : constraintViolations) { + FacesContext.getCurrentInstance().addMessage(null, new FacesMessage(FacesMessage.SEVERITY_ERROR, "Validation Error", constraintViolation.getMessage())); + dsf.setValidationMessage(constraintViolation.getMessage()); + dontSave = true; + break; // currently only support one message, so we can break out of the loop after the first constraint violation + } + for (DatasetFieldValue dsfv : dsf.getDatasetFieldValues()) { + dsfv.setValidationMessage(null); // clear out any existing validation message + Set> constraintViolations2 = validator.validate(dsfv); + for (ConstraintViolation constraintViolation : constraintViolations2) { + FacesContext.getCurrentInstance().addMessage(null, new FacesMessage(FacesMessage.SEVERITY_ERROR, "Validation Error", constraintViolation.getMessage())); + dsfv.setValidationMessage(constraintViolation.getMessage()); + dontSave = true; + break; // currently only support one message, so we can break out of the loop after the first constraint violation + } + } + } + if (dontSave) { + return ""; + } + dataset.setOwner(dataverseService.find(ownerId)); //TODO get real application-wide protocol/authority dataset.setProtocol("doi"); @@ -374,7 +393,7 @@ public String save() { if (dataset.getFileSystemDirectory() != null && Files.exists(dataset.getFileSystemDirectory())) { for (DataFile dFile : newFiles) { String tempFileLocation = getFilesTempDirectory() + "/" + dFile.getFileSystemName(); - + // These are all brand new files, so they should all have // one filemetadata total. 
You do NOT want to use // getLatestFilemetadata() here - because it relies on @@ -382,7 +401,7 @@ public String save() { // Which may not have been persisted yet. // -- L.A. 4.0 beta. FileMetadata fileMetadata = dFile.getFileMetadatas().get(0); - String fileName = fileMetadata.getLabel(); + String fileName = fileMetadata.getLabel(); //boolean ingestedAsTabular = false; boolean metadataExtracted = false; @@ -477,7 +496,7 @@ public String save() { } } - return "/dataset.xhtml?id=" + dataset.getId() + "&faces-redirect=true"; + return "/dataset.xhtml?id=" + dataset.getId() + "&versionId=" + dataset.getLatestVersion().getId() + "&faces-redirect=true"; } private String getFilesTempDirectory() { @@ -621,7 +640,7 @@ public void handleFileUpload(FileUploadEvent event) { fmd.setDataFile(dFile); dFile.getFileMetadatas().add(fmd); - + if (editVersion.getFileMetadatas() == null) { editVersion.setFileMetadatas(new ArrayList()); } @@ -681,8 +700,8 @@ public List getVersionTabList() { public void setVersionTabList(List versionTabList) { this.versionTabList = versionTabList; } - - private boolean canIssueUpdateCommand(){ + + private boolean canIssueUpdateCommand() { try { if (permissionServiceBean.on(dataset).canIssueCommand("UpdateDatasetCommand")) { return true; @@ -694,21 +713,21 @@ private boolean canIssueUpdateCommand(){ } return false; } - + private List resetVersionTabList() { - List retList = new ArrayList(); + List retList = new ArrayList(); - if (canIssueUpdateCommand()) { - return dataset.getVersions(); - } else { - for(DatasetVersion version: dataset.getVersions()){ - if (version.isReleased()){ - retList.add(version); - } + if (canIssueUpdateCommand()) { + return dataset.getVersions(); + } else { + for (DatasetVersion version : dataset.getVersions()) { + if (version.isReleased()) { + retList.add(version); } - return retList; - } + return retList; + + } } - + } diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 12d5d0699c8..d983f558630 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -370,6 +370,10 @@ public void setDatasetAuthors( List authors ) { // FIXME add the authores to the relevant fields } + public String getCitation(){ + return getCitation(false); + } + public String getCitation(boolean isOnlineVersion) { @@ -425,8 +429,13 @@ public String getCitation(boolean isOnlineVersion) { } str += " " + rootDataverseName + " Dataverse"; } - - if (this.getVersionNumber() != null) { + if (this.isDraft()){ + if (!StringUtil.isEmpty(str)) { + str += ", "; + } + str += " DRAFT VERSION "; + + } else if (this.getVersionNumber() != null) { if (!StringUtil.isEmpty(str)) { str += ", "; } @@ -516,7 +525,7 @@ private DatasetField initDatasetField(DatasetField dsf) { } if (add) { - cv.getChildDatasetFields().add(DatasetField.createNewEmptyDatasetField(dsfType, cv)); + cv.getChildDatasetFields().add(DatasetField.createNewEmptyChildDatasetField(dsfType, cv)); } } } diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionUI.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionUI.java index 859d2ed36ec..9e715dd2ce8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionUI.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionUI.java @@ -312,7 +312,7 @@ private DatasetField initDatasetField(DatasetField dsf) { } if (add) { - 
cv.getChildDatasetFields().add(DatasetField.createNewEmptyDatasetField(dsfType, cv)); + cv.getChildDatasetFields().add(DatasetField.createNewEmptyChildDatasetField(dsfType, cv)); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/DataversePage.java b/src/main/java/edu/harvard/iq/dataverse/DataversePage.java index fde6c4dcec1..e8b357c7859 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataversePage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataversePage.java @@ -25,6 +25,8 @@ import java.util.ArrayList; import java.util.Date; import java.util.logging.Logger; +import javax.faces.component.UIComponent; +import javax.faces.component.UIInput; import org.primefaces.model.DualListModel; /** @@ -259,4 +261,23 @@ public String getMetadataBlockPreview(MetadataBlock mdb, int numberOfItems) { return mdbPreview.toString(); } + public void validateAlias(FacesContext context, UIComponent toValidate, Object value) { + String alias = (String) value; + boolean aliasFound = false; + Dataverse dv = dataverseService.findByAlias(alias); + if (editMode == DataversePage.EditMode.CREATE) { + if (dv != null) { + aliasFound = true; + } + } else { + if (dv != null && !dv.getId().equals(dataverse.getId())) { + aliasFound = true; + } + } + if (aliasFound) { + ((UIInput) toValidate).setValid(false); + FacesMessage message = new FacesMessage("This Alias is already taken."); + context.addMessage(toValidate.getClientId(context), message); + } + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/DataverseUserPage.java b/src/main/java/edu/harvard/iq/dataverse/DataverseUserPage.java index ae2263c5fc7..de4d262cd3a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataverseUserPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataverseUserPage.java @@ -29,7 +29,7 @@ public class DataverseUserPage implements java.io.Serializable { public enum EditMode { - CREATE, INFO, EDIT, CHANGE, FORGOT + CREATE, EDIT, CHANGE, FORGOT }; @Inject @@ -253,7 +253,7 @@ public String save() { return "/dataverse.xhtml?faces-redirect=true;"; } - editMode = EditMode.INFO; + editMode = null; return null; } @@ -262,13 +262,13 @@ public String cancel() { return "/dataverse.xhtml?faces-redirect=true;"; } - editMode = EditMode.INFO; + editMode = null; return null; } public void submit(ActionEvent e) { updatePassword(dataverseUser.getUserName()); - editMode = EditMode.INFO; + editMode = null; } public String remove(Long notificationId) { diff --git a/src/main/java/edu/harvard/iq/dataverse/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/IndexServiceBean.java index 22d0e8cfbbd..98ce11c8ae1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/IndexServiceBean.java @@ -6,6 +6,8 @@ import edu.harvard.iq.dataverse.search.IndexableObject; import edu.harvard.iq.dataverse.util.FileUtil; import java.io.IOException; +import java.sql.Timestamp; +import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; @@ -139,10 +141,15 @@ public String indexDataverse(Dataverse dataverse) { if (dataverse.isReleased()) { solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING); solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, dataverse.getPublicationDate()); + solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(dataverse.getPublicationDate())); solrInputDocument.addField(SearchFields.PERMS, publicGroupString); - } else if 
(dataverse.getCreator() != null) { + } else if (dataverse.getCreator() != null) { //@todo: do we need this check still solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING); solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, dataverse.getCreateDate()); + solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(dataverse.getCreateDate())); + } + + if (dataverse.getCreator() != null) { solrInputDocument.addField(SearchFields.PERMS, groupPerUserPrefix + dataverse.getCreator().getId()); /** * @todo: replace this fake version of granting users access to @@ -229,7 +236,7 @@ public String indexDataset(Dataset dataset) { */ String solrIdPublishedStudy = "dataset_" + dataset.getId(); StringBuilder sb = new StringBuilder(); - sb.append("rationale:\n"); + sb.append("rationale:\n"); List versions = dataset.getVersions(); for (DatasetVersion datasetVersion : versions) { Long versionDatabaseId = datasetVersion.getId(); @@ -243,6 +250,16 @@ public String indexDataset(Dataset dataset) { sb.append("- semanticVersion-STATE: " + semanticVersion + "-" + versionState + "\n"); sb.append("- isWorkingCopy: " + versionIsWorkingCopy + "\n"); sb.append("- isReleased: " + versionIsReleased + "\n"); + List fileMetadatas = datasetVersion.getFileMetadatas(); + List fileInfo = new ArrayList<>(); + for (FileMetadata fileMetadata : fileMetadatas) { + fileInfo.add(fileMetadata.getDataFile().getId() + ":" + fileMetadata.getLabel()); + } + int numFiles = 0; + if (fileMetadatas != null) { + numFiles = fileMetadatas.size(); + } + sb.append("- files: " + numFiles + " " + fileInfo.toString() + "\n"); } DatasetVersion latestVersion = dataset.getLatestVersion(); String latestVersionState = latestVersion.getVersionState().name(); @@ -327,14 +344,14 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) { solrInputDocument.addField(SearchFields.ENTITY_ID, dataset.getId()); solrInputDocument.addField(SearchFields.TYPE, "datasets"); - Date sortByDate = new Date(); + Date datasetSortByDate = new Date(); Date majorVersionReleaseDate = dataset.getMostRecentMajorVersionReleaseDate(); if (majorVersionReleaseDate != null) { if (true) { String msg = "major release date found: " + majorVersionReleaseDate.toString(); logger.info(msg); } - sortByDate = majorVersionReleaseDate; + datasetSortByDate = majorVersionReleaseDate; } else { solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING); Date createDate = dataset.getCreateDate(); @@ -343,14 +360,15 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) { String msg = "can't find major release date, using create date: " + createDate; logger.info(msg); } - sortByDate = createDate; + datasetSortByDate = createDate; } else { String msg = "can't find major release date or create date, using \"now\""; logger.info(msg); - sortByDate = new Date(); + datasetSortByDate = new Date(); } } - solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, sortByDate); + solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, datasetSortByDate); + solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(datasetSortByDate)); if (state.equals(indexableDataset.getDatasetState().PUBLISHED)) { solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING); @@ -358,21 +376,21 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) { solrInputDocument.addField(SearchFields.PERMS, 
publicGroupString); } else if (state.equals(indexableDataset.getDatasetState().WORKING_COPY)) { solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING); - DataverseUser creator = dataset.getCreator(); - if (creator != null) { - solrInputDocument.addField(SearchFields.PERMS, groupPerUserPrefix + creator.getId()); - /** - * @todo: replace this fake version of granting users access to - * dataverses with the real thing, when it's available in the - * app - */ - if (creator.getUserName().equals("pete")) { - // figure out if cathy is around - DataverseUser cathy = dataverseUserServiceBean.findByUserName("cathy"); - if (cathy != null) { - // let cathy see all of pete's dataverses - solrInputDocument.addField(SearchFields.PERMS, groupPerUserPrefix + cathy.getId()); - } + } + + DataverseUser creator = dataset.getCreator(); + if (creator != null) { + solrInputDocument.addField(SearchFields.PERMS, groupPerUserPrefix + creator.getId()); + /** + * @todo: replace this fake version of granting users access to + * dataverses with the real thing, when it's available in the app + */ + if (creator.getUserName().equals("pete")) { + // figure out if cathy is around + DataverseUser cathy = dataverseUserServiceBean.findByUserName("cathy"); + if (cathy != null) { + // let cathy see all of pete's dataverses + solrInputDocument.addField(SearchFields.PERMS, groupPerUserPrefix + cathy.getId()); } } } @@ -385,20 +403,10 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) { addDatasetReleaseDateToSolrDoc(solrInputDocument, dataset); DatasetVersion datasetVersion = indexableDataset.getDatasetVersion(); + String parentDatasetTitle = "TBD"; if (datasetVersion != null) { - - String citation = null; - try { - citation = dataset.getCitation(false, datasetVersion); - if (citation != null) { - solrInputDocument.addField(SearchFields.CITATION, citation); - } - - } catch (NullPointerException ex) { - logger.info("Caught exception trying to get citation for dataset " + dataset.getId() + ". : " + ex); - } - + solrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId()); for (DatasetField dsf : datasetVersion.getFlatDatasetFields()) { @@ -448,6 +456,11 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) { solrInputDocument.addField(SearchFields.AFFILIATION, dsf.getValues()); } else if (dsf.getDatasetFieldType().getName().equals("title")) { // datasets have titles not names but index title under name as well so we can sort datasets by name along dataverses and files + List possibleTitles = dsf.getValues(); + String firstTitle = possibleTitles.get(0); + if (firstTitle != null) { + parentDatasetTitle = firstTitle; + } solrInputDocument.addField(SearchFields.NAME_SORT, dsf.getValues()); } if (dsfType.isControlledVocabulary()) { @@ -551,14 +564,18 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) { docs.add(solrInputDocument); - List files = dataset.getFiles(); - for (DataFile dataFile : files) { + if (datasetVersion != null) { + List fileMetadatas = datasetVersion.getFileMetadatas(); + for (FileMetadata fileMetadata : fileMetadatas) { SolrInputDocument datafileSolrInputDocument = new SolrInputDocument(); - datafileSolrInputDocument.addField(SearchFields.ID, "datafile_" + dataFile.getId()); - datafileSolrInputDocument.addField(SearchFields.ENTITY_ID, dataFile.getId()); + Long fileEntityId = fileMetadata.getDataFile().getId(); + /** + * @todo: should this sometimes end with "_draft" like datasets do? 
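+     * (Editor's note: in the expected output added in this change,
+     * scripts/search/tests/expected/dataset-versioning04pete, datasets appear
+     * as both "dataset_17" and "dataset_17_draft" while files only appear as
+     * "datafile_18" and "datafile_19" -- i.e. there is currently a single
+     * Solr card per file, which for published files always shows the
+     * published information, as noted in dataset-versioning05.)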
+ */ + datafileSolrInputDocument.addField(SearchFields.ID, "datafile_" + fileEntityId); + datafileSolrInputDocument.addField(SearchFields.ENTITY_ID, fileEntityId); datafileSolrInputDocument.addField(SearchFields.TYPE, "files"); - FileMetadata fileMetadata = dataFile.getFileMetadata(); String filenameCompleteFinal = ""; if (fileMetadata != null) { String filenameComplete = fileMetadata.getLabel(); @@ -586,34 +603,76 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) { datafileSolrInputDocument.addField(SearchFields.NAME_SORT, filenameCompleteFinal); datafileSolrInputDocument.addField(SearchFields.FILE_NAME, filenameCompleteFinal); - datafileSolrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, sortByDate); + /** + * for rules on sorting files see + * https://docs.google.com/a/harvard.edu/document/d/1DWsEqT8KfheKZmMB3n_VhJpl9nIxiUjai_AIQPAjiyA/edit?usp=sharing + * via https://redmine.hmdc.harvard.edu/issues/3701 + */ + Date fileSortByDate = new Date(); + DataFile datafile = fileMetadata.getDataFile(); + if (datafile != null) { + boolean fileHasBeenReleased = datafile.isReleased(); + if (fileHasBeenReleased) { + logger.info("indexing file with filePublicationTimestamp. " + fileMetadata.getId() + " (file id " + datafile.getId() + ")"); + Timestamp filePublicationTimestamp = datafile.getPublicationDate(); + if (filePublicationTimestamp != null) { + fileSortByDate = filePublicationTimestamp; + } else { + String msg = "filePublicationTimestamp was null for fileMetadata id " + fileMetadata.getId() + " (file id " + datafile.getId() + ")"; + logger.info(msg); + } + } else { + logger.info("indexing file with fileCreateTimestamp. " + fileMetadata.getId() + " (file id " + datafile.getId() + ")"); + Timestamp fileCreateTimestamp = datafile.getCreateDate(); + if (fileCreateTimestamp != null) { + fileSortByDate = fileCreateTimestamp; + } else { + String msg = "fileCreateTimestamp was null for fileMetadata id " + fileMetadata.getId() + " (file id " + datafile.getId() + ")"; + logger.info(msg); + } + } + } + if (fileSortByDate == null) { + if (datasetSortByDate != null) { + logger.info("fileSortByDate was null, assigning datasetSortByDate"); + fileSortByDate = datasetSortByDate; + } else { + logger.info("fileSortByDate and datasetSortByDate were null, assigning 'now'"); + fileSortByDate = new Date(); + } + } + datafileSolrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, fileSortByDate); + datafileSolrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(fileSortByDate)); + if (majorVersionReleaseDate == null) { datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING); } if (indexableDataset.getDatasetState().equals(indexableDataset.getDatasetState().PUBLISHED)) { datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING); datafileSolrInputDocument.addField(SearchFields.PERMS, publicGroupString); + addDatasetReleaseDateToSolrDoc(datafileSolrInputDocument, dataset); } else if (indexableDataset.getDatasetState().equals(indexableDataset.getDatasetState().WORKING_COPY)) { datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING); - DataverseUser creator = dataFile.getOwner().getCreator(); - if (creator != null) { - datafileSolrInputDocument.addField(SearchFields.PERMS, groupPerUserPrefix + creator.getId()); - /** - * @todo: replace this fake version of granting users access - * to dataverses with the real thing, when it's available in - * the app 
- */ - if (creator.getUserName().equals("pete")) { - // figure out if cathy is around - DataverseUser cathy = dataverseUserServiceBean.findByUserName("cathy"); - if (cathy != null) { - // let cathy see all of pete's dataverses - datafileSolrInputDocument.addField(SearchFields.PERMS, groupPerUserPrefix + cathy.getId()); - } + } + + if (creator != null) { + datafileSolrInputDocument.addField(SearchFields.PERMS, groupPerUserPrefix + creator.getId()); + /** + * @todo: replace this fake version of granting users access to + * dataverses with the real thing, when it's available in the + * app + */ + if (creator.getUserName().equals("pete")) { + // figure out if cathy is around + DataverseUser cathy = dataverseUserServiceBean.findByUserName("cathy"); + if (cathy != null) { + // let cathy see all of pete's dataverses + datafileSolrInputDocument.addField(SearchFields.PERMS, groupPerUserPrefix + cathy.getId()); } } } + /** * @todo: remove this fake "has access to all data" group */ @@ -623,27 +682,26 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) { // "PDF File" instead of "application/pdf", "MS Excel" instead of // "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" (!), etc., // if available: - datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_MIME, dataFile.getFriendlyType()); - datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, dataFile.getFriendlyType()); + datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_MIME, fileMetadata.getDataFile().getFriendlyType()); + datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, fileMetadata.getDataFile().getFriendlyType()); // For the file type facets, we have a property file that maps mime types // to facet-friendly names; "application/fits" should become "FITS", etc.: - datafileSolrInputDocument.addField(SearchFields.FILE_TYPE, FileUtil.getFacetFileType(dataFile)); - datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, FileUtil.getFacetFileType(dataFile)); - datafileSolrInputDocument.addField(SearchFields.DESCRIPTION, dataFile.getDescription()); + datafileSolrInputDocument.addField(SearchFields.FILE_TYPE, FileUtil.getFacetFileType(fileMetadata.getDataFile())); + datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, FileUtil.getFacetFileType(fileMetadata.getDataFile())); + datafileSolrInputDocument.addField(SearchFields.DESCRIPTION, fileMetadata.getDescription()); datafileSolrInputDocument.addField(SearchFields.SUBTREE, dataversePaths); // datafileSolrInputDocument.addField(SearchFields.HOST_DATAVERSE, dataFile.getOwner().getOwner().getName()); // datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, dataFile.getDataset().getTitle()); - datafileSolrInputDocument.addField(SearchFields.PARENT_ID, dataFile.getOwner().getId()); - if (!dataFile.getOwner().getLatestVersion().getTitle().isEmpty()) { - datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, dataFile.getOwner().getLatestVersion().getTitle()); - } + datafileSolrInputDocument.addField(SearchFields.PARENT_ID, fileMetadata.getDataFile().getOwner().getId()); + + datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, parentDatasetTitle); // If this is a tabular data file -- i.e., if there are data // variables associated with this file, we index the variable // names and labels: - if (dataFile.isTabularData()) { - List variables = dataFile.getDataTable().getDataVariables(); + if (fileMetadata.getDataFile().isTabularData()) { + List variables = 
fileMetadata.getDataFile().getDataTable().getDataVariables(); String variableNamesToIndex = null; String variableLabelsToIndex = null; for (DataVariable var : variables) { @@ -685,7 +743,7 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) { // And if the file has indexable file-level metadata associated // with it, we'll index that too: - List fileMetadataFieldValues = dataFile.getFileMetadataFieldValues(); + List fileMetadataFieldValues = fileMetadata.getDataFile().getFileMetadataFieldValues(); if (fileMetadataFieldValues != null && fileMetadataFieldValues.size() > 0) { for (int j = 0; j < fileMetadataFieldValues.size(); j++) { @@ -703,6 +761,7 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) { docs.add(datafileSolrInputDocument); } + } /** * @todo allow for configuration of hostname and port @@ -911,4 +970,14 @@ public String removeDatasetDraftFromIndex(String doomed) { return response; } + public String convertToFriendlyDate(Date dateAsDate) { + if (dateAsDate == null) { + dateAsDate = new Date(); + } + // using DateFormat.MEDIUM for May 5, 2014 to match what's in DVN 3.x + DateFormat format = DateFormat.getDateInstance(DateFormat.MEDIUM); + String friendlyDate = format.format(dateAsDate); + return friendlyDate; + } + } diff --git a/src/main/java/edu/harvard/iq/dataverse/SearchIncludeFragment.java b/src/main/java/edu/harvard/iq/dataverse/SearchIncludeFragment.java index 70c2191dff7..a8e83ba89dc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/SearchIncludeFragment.java +++ b/src/main/java/edu/harvard/iq/dataverse/SearchIncludeFragment.java @@ -15,6 +15,7 @@ import javax.faces.view.ViewScoped; import javax.inject.Inject; import javax.inject.Named; +import org.apache.commons.lang.StringUtils; @ViewScoped @Named("SearchIncludeFragment") @@ -29,6 +30,8 @@ public class SearchIncludeFragment { @EJB DatasetServiceBean datasetService; @EJB + DatasetVersionServiceBean datasetVersionService; + @EJB DataFileServiceBean dataFileService; @EJB PermissionServiceBean permissionService; @@ -119,7 +122,7 @@ public class SearchIncludeFragment { * * see also https://trello.com/c/jmry3BJR/28-browse-dataverses */ - public String searchRedirect(String stayOnDataversePage) { + public String searchRedirect(String dataverseRedirectPage) { /** * These are our decided-upon search/browse rules, the way we expect * users to search/browse and how we want the app behave: @@ -152,18 +155,19 @@ public String searchRedirect(String stayOnDataversePage) { * selections and what page you are on should be preserved. * */ - if (stayOnDataversePage.equals("true")) { - String optionalDataverseScope = ""; - if (!dataverse.getId().equals(dataverseService.findRootDataverse().getId())) { - optionalDataverseScope = "&id=" + dataverse.getId(); - } - return "dataverse.xhtml?faces-redirect=true&q=" + query + optionalDataverseScope ; - } else { - return "FIXME"; - } + + dataverseRedirectPage = StringUtils.isBlank(dataverseRedirectPage) ? "dataverse.xhtml" : dataverseRedirectPage; + String optionalDataverseScope = dataverse.getId().equals(dataverseService.findRootDataverse().getId()) ? 
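
The new convertToFriendlyDate() helper feeds the searchable date field so the card date can also be matched by text search. A quick standalone check of the DateFormat.MEDIUM style it relies on (locale pinned to US here only to make the output deterministic):

    import java.text.DateFormat;
    import java.util.Calendar;
    import java.util.Date;
    import java.util.GregorianCalendar;
    import java.util.Locale;

    public class FriendlyDateDemo {
        public static void main(String[] args) {
            Date d = new GregorianCalendar(2014, Calendar.MAY, 5).getTime();
            DateFormat format = DateFormat.getDateInstance(DateFormat.MEDIUM, Locale.US);
            System.out.println(format.format(d)); // prints "May 5, 2014", the DVN 3.x style
        }
    }
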
"" : "&id=" + dataverse.getId(); + + return dataverseRedirectPage + "?faces-redirect=true&q=" + query + optionalDataverseScope ; + } public void search() { + search(false); + } + + public void search(boolean onlyDataRelatedToMe) { logger.info("search called"); // wildcard/browse (*) unless user supplies a query @@ -262,8 +266,8 @@ public void search() { // } else { // publishedToggle = SearchServiceBean.PublishedToggle.PUBLISHED; // } - solrQueryResponse = searchService.search(session.getUser(), dataverse, queryToPassToSolr, filterQueriesFinal, sortField, sortOrder, paginationStart, publishedToggle); - solrQueryResponseAllTypes = searchService.search(session.getUser(), dataverse, queryToPassToSolr, filterQueriesFinalAllTypes, sortField, sortOrder, paginationStart, publishedToggle); + solrQueryResponse = searchService.search(session.getUser(), dataverse, queryToPassToSolr, filterQueriesFinal, sortField, sortOrder, paginationStart, onlyDataRelatedToMe); + solrQueryResponseAllTypes = searchService.search(session.getUser(), dataverse, queryToPassToSolr, filterQueriesFinalAllTypes, sortField, sortOrder, paginationStart, onlyDataRelatedToMe); } catch (EJBException ex) { Throwable cause = ex; StringBuilder sb = new StringBuilder(); @@ -315,19 +319,15 @@ public void search() { solrSearchResult.setStatus(getCreatedOrReleasedDate(dataverseInCard, solrSearchResult.getReleaseOrCreateDate())); } } else if (solrSearchResult.getType().equals("datasets")) { - Dataset dataset = datasetService.find(solrSearchResult.getEntityId()); - if (dataset != null) { - String citation = null; - try { - citation = dataset.getCitation(); - } catch (NullPointerException ex) { - logger.info("Caught exception trying to get citation for dataset " + dataset.getId() + ". : " + ex); - } + Long datasetVersionId = solrSearchResult.getDatasetVersionId(); + if (datasetVersionId != null) { + DatasetVersion datasetVersion = datasetVersionService.find(datasetVersionId); + if (datasetVersion != null) { + String citation = datasetVersion.getCitation(); + if (citation != null) { solrSearchResult.setCitation(citation); - String solrId = solrSearchResult.getId(); - solrSearchResult.setStatus(solrId + " " + getCreatedOrReleasedDate(dataset, solrSearchResult.getReleaseOrCreateDate())); - } else { - logger.info("couldn't find dataset id " + solrSearchResult.getEntityId() + ". Stale Solr data? 
Time to re-index?"); + } + } } } else if (solrSearchResult.getType().equals("files")) { DataFile dataFile = dataFileService.find(solrSearchResult.getEntityId()); diff --git a/src/main/java/edu/harvard/iq/dataverse/SearchServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/SearchServiceBean.java index 992ca7f68b7..74c3e699b35 100644 --- a/src/main/java/edu/harvard/iq/dataverse/SearchServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/SearchServiceBean.java @@ -63,7 +63,10 @@ public enum PublishedToggle { }; public SolrQueryResponse search(DataverseUser dataverseUser, Dataverse dataverse, String query, List filterQueries, String sortField, String sortOrder, int paginationStart, PublishedToggle publishedToggle) { -// if (publishedToggle.equals(PublishedToggle.PUBLISHED)) { + return search( dataverseUser, dataverse, query, filterQueries, sortField, sortOrder, paginationStart, false); + } + + public SolrQueryResponse search(DataverseUser dataverseUser, Dataverse dataverse, String query, List filterQueries, String sortField, String sortOrder, int paginationStart, boolean onlyDatatRelatedToMe) {// if (publishedToggle.equals(PublishedToggle.PUBLISHED)) {// if (publishedToggle.equals(PublishedToggle.PUBLISHED)) { // filterQueries.add(SearchFields.PUBLICATION_STATUS + ":" + IndexServiceBean.getPUBLISHED_STRING()); // } else { // filterQueries.add(SearchFields.PUBLICATION_STATUS + ":" + IndexServiceBean.getUNPUBLISHED_STRING()); @@ -88,7 +91,6 @@ public SolrQueryResponse search(DataverseUser dataverseUser, Dataverse dataverse Map solrFieldsToHightlightOnMap = new HashMap<>(); solrFieldsToHightlightOnMap.put(SearchFields.NAME, "Name"); solrFieldsToHightlightOnMap.put(SearchFields.AFFILIATION, "Affiliation"); - solrFieldsToHightlightOnMap.put(SearchFields.CITATION, "Citation"); solrFieldsToHightlightOnMap.put(SearchFields.FILE_TYPE_MIME, "File Type"); solrFieldsToHightlightOnMap.put(SearchFields.DESCRIPTION, "Description"); /** @@ -135,16 +137,16 @@ public SolrQueryResponse search(DataverseUser dataverseUser, Dataverse dataverse * https://access.redhat.com/site/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Deployment_Guide/ch-Managing_Users_and_Groups.html#s2-users-groups-private-groups */ String publicPlusUserPrivateGroup = "(" - + publicOnly - + " OR {!join from=" + SearchFields.GROUPS + " to=" + SearchFields.PERMS + "}id:" + IndexServiceBean.getGroupPerUserPrefix() + dataverseUser.getId() + ")"; + + (onlyDatatRelatedToMe ? "" : (publicOnly + " OR ")) + + "{!join from=" + SearchFields.GROUPS + " to=" + SearchFields.PERMS + "}id:" + IndexServiceBean.getGroupPerUserPrefix() + dataverseUser.getId() + ")"; /** * @todo: replace this with a real group... look up the user's * groups (once you can) */ if (dataverseUser.getPosition().equals("Signals Intelligence")) { String publicPlusUserPrivateGroupPlusNSA = "(" - + publicOnly - + " OR {!join from=" + SearchFields.GROUPS + " to=" + SearchFields.PERMS + "}id:" + IndexServiceBean.getGroupPerUserPrefix() + dataverseUser.getId() + + (onlyDatatRelatedToMe ? 
"" : (publicOnly + " OR ")) + + "{!join from=" + SearchFields.GROUPS + " to=" + SearchFields.PERMS + "}id:" + IndexServiceBean.getGroupPerUserPrefix() + dataverseUser.getId() + " OR {!join from=" + SearchFields.GROUPS + " to=" + SearchFields.PERMS + "}id:" + IndexServiceBean.getGroupPrefix() + IndexServiceBean.getTmpNsaGroupId() + ")"; permissionFilterQuery = publicPlusUserPrivateGroupPlusNSA; @@ -273,10 +275,12 @@ public SolrQueryResponse search(DataverseUser dataverseUser, Dataverse dataverse String nameSort = (String) solrDocument.getFieldValue(SearchFields.NAME_SORT); // ArrayList titles = (ArrayList) solrDocument.getFieldValues(SearchFields.TITLE); String title = (String) solrDocument.getFieldValue(titleSolrField); + Long datasetVersionId = (Long) solrDocument.getFieldValue(SearchFields.DATASET_VERSION_ID); // logger.info("titleSolrField: " + titleSolrField); // logger.info("title: " + title); String filetype = (String) solrDocument.getFieldValue(SearchFields.FILE_TYPE_MIME); Date release_or_create_date = (Date) solrDocument.getFieldValue(SearchFields.RELEASE_OR_CREATE_DATE); + String dateToDisplayOnCard = (String) solrDocument.getFirstValue(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT); List matchedFields = new ArrayList<>(); List highlights = new ArrayList<>(); Map highlightsMap = new HashMap<>(); @@ -323,6 +327,7 @@ public SolrQueryResponse search(DataverseUser dataverseUser, Dataverse dataverse solrSearchResult.setType(type); solrSearchResult.setNameSort(nameSort); solrSearchResult.setReleaseOrCreateDate(release_or_create_date); + solrSearchResult.setDateToDisplayOnCard(dateToDisplayOnCard); solrSearchResult.setMatchedFields(matchedFields); solrSearchResult.setHighlightsAsList(highlights); solrSearchResult.setHighlightsMap(highlightsMap); @@ -335,6 +340,7 @@ public SolrQueryResponse search(DataverseUser dataverseUser, Dataverse dataverse } else if (type.equals("datasets")) { String datasetDescription = (String) solrDocument.getFieldValue(SearchFields.DATASET_DESCRIPTION); solrSearchResult.setDescriptionNoSnippet(datasetDescription); + solrSearchResult.setDatasetVersionId(datasetVersionId); if (title != null) { // solrSearchResult.setTitle((String) titles.get(0)); solrSearchResult.setTitle((String) title); diff --git a/src/main/java/edu/harvard/iq/dataverse/SolrSearchResult.java b/src/main/java/edu/harvard/iq/dataverse/SolrSearchResult.java index 5f78bc9e8e6..89f313e5d32 100644 --- a/src/main/java/edu/harvard/iq/dataverse/SolrSearchResult.java +++ b/src/main/java/edu/harvard/iq/dataverse/SolrSearchResult.java @@ -24,6 +24,7 @@ public class SolrSearchResult { private String nameSort; private String status; private Date releaseOrCreateDate; + private String dateToDisplayOnCard; /** * @todo: how important is it to differentiate between name and title? 
@@ -44,6 +45,7 @@ public class SolrSearchResult { // private boolean statePublished; private boolean unpublishedState; private boolean draftState; + private long datasetVersionId; // public boolean isStatePublished() { // return statePublished; @@ -100,17 +102,6 @@ public String getNameHighlightSnippet() { return null; } - public String getCitationHighlightSnippet() { - Highlight highlight = highlightsAsMap.get(SearchFields.CITATION); - if (highlight != null) { - String firstSnippet = highlight.getSnippets().get(0); - if (firstSnippet != null) { - return firstSnippet; - } - } - return null; - } - public String getDataverseAffiliationHighlightSnippet() { Highlight highlight = highlightsAsMap.get(SearchFields.AFFILIATION); if (highlight != null) { @@ -238,6 +229,7 @@ public JsonObject toJsonObject() { * @todo: don't hard code this */ typeSpecificFields.add("title_s", this.title); + typeSpecificFields.add(SearchFields.DATASET_VERSION_ID, this.datasetVersionId); } else if (this.type.equals("files")) { typeSpecificFields.add(SearchFields.NAME, this.name); typeSpecificFields.add(SearchFields.FILE_TYPE_MIME, this.filetype); @@ -329,7 +321,6 @@ public List getHighlightsAsList() { && !field.equals(SearchFields.DESCRIPTION) && !field.equals(SearchFields.DATASET_DESCRIPTION) && !field.equals(SearchFields.AFFILIATION) - && !field.equals(SearchFields.CITATION) && !field.equals("title")) { filtered.add(highlight); } @@ -404,4 +395,21 @@ public Date getReleaseOrCreateDate() { public void setReleaseOrCreateDate(Date releaseOrCreateDate) { this.releaseOrCreateDate = releaseOrCreateDate; } + + public String getDateToDisplayOnCard() { + return dateToDisplayOnCard; + } + + public void setDateToDisplayOnCard(String dateToDisplayOnCard) { + this.dateToDisplayOnCard = dateToDisplayOnCard; + } + + public long getDatasetVersionId() { + return datasetVersionId; + } + + public void setDatasetVersionId(long datasetVersionId) { + this.datasetVersionId = datasetVersionId; + } + } diff --git a/src/main/java/edu/harvard/iq/dataverse/ValidateDatasetFieldType.java b/src/main/java/edu/harvard/iq/dataverse/ValidateDatasetFieldType.java index 8ae89a9dd6e..ae7b4a1eaef 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ValidateDatasetFieldType.java +++ b/src/main/java/edu/harvard/iq/dataverse/ValidateDatasetFieldType.java @@ -18,7 +18,7 @@ @Target({TYPE, ANNOTATION_TYPE}) @Retention(RUNTIME) -@Constraint(validatedBy = DatasetFieldTypeValidator.class) +@Constraint(validatedBy = {DatasetFieldValidator.class, DatasetFieldValueValidator.class}) @Documented public @interface ValidateDatasetFieldType { diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Access.java b/src/main/java/edu/harvard/iq/dataverse/api/Access.java index 1f839570e7e..35c4cea4e80 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Access.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Access.java @@ -104,7 +104,7 @@ public DownloadInstance datafile(@PathParam("fileId") Long fileId, @Context UriI * (and yes, this is a hack) * TODO: un-hack this. -- L.A. 
4.0 alpha 1 */ - if (df.getContentType() != null && df.getContentType().startsWith("image/")) { + if (df.getContentType() != null && (df.getContentType().startsWith("image/") || df.getContentType().equalsIgnoreCase("application/pdf"))) { dInfo.addServiceAvailable(new OptionalAccessService("thumbnail", "image/png", "imageThumb=true", "Image Thumbnail (64x64)")); } @@ -153,35 +153,42 @@ public DownloadInstance datafile(@PathParam("fileId") Long fileId, @Context UriI @Path("imagethumb/{fileSystemId}") @GET - @Produces({ "image/png" }) - public InputStream imagethumb(@PathParam("fileSystemId") Long fileSystemId, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/ { + @Produces({"image/png"}) + public InputStream imagethumb(@PathParam("fileSystemId") Long fileSystemId, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/ { String filesRootDirectory = System.getProperty("dataverse.files.directory"); if (filesRootDirectory == null || filesRootDirectory.equals("")) { filesRootDirectory = "/tmp/files"; } - + String fileSystemName = filesRootDirectory + "/temp/" + fileSystemId; - String imageThumbFileName = ImageThumbConverter.generateImageThumb(fileSystemName); - if (imageThumbFileName != null) { - InputStream in; + + String mimeTypeParam = uriInfo.getQueryParameters().getFirst("mimetype"); + String imageThumbFileName = null; + + if ("application/pdf".equals(mimeTypeParam)) { + imageThumbFileName = ImageThumbConverter.generatePDFThumb(fileSystemName); + } else { + imageThumbFileName = ImageThumbConverter.generateImageThumb(fileSystemName); + } + + if (imageThumbFileName == null) { + imageThumbFileName = getWebappImageResource(DEFAULT_FILE_ICON); + } - try { - in = new FileInputStream(imageThumbFileName); - } catch (Exception ex) { - // We don't particularly care what the reason why we have - // failed to access the file was. - // From the point of view of the download subsystem, it's a - // binary operation -- it's either successfull or not. - // If we can't access it for whatever reason, we are saying - // it's 404 NOT FOUND in our HTTP response. - return null; - } - return in; + InputStream in; + + try { + in = new FileInputStream(imageThumbFileName); + } catch (Exception ex) { + + return null; } + return in; - return null; } + + @Path("preview/{fileId}") @GET @Produces({ "image/png" }) @@ -197,7 +204,9 @@ public InputStream preview(@PathParam("fileId") Long fileId, @Context UriInfo ur } String imageThumbFileName = null; - if (df != null && df.isImage()) { + if (df != null && ("application/pdf".equalsIgnoreCase(df.getContentType()))) { + imageThumbFileName = ImageThumbConverter.generatePDFThumb(df.getFileSystemLocation().toString(), 48); + } else if (df != null && df.isImage()) { imageThumbFileName = ImageThumbConverter.generateImageThumb(df.getFileSystemLocation().toString(), 48); } else { imageThumbFileName = getWebappImageResource (DEFAULT_FILE_ICON); @@ -209,12 +218,6 @@ public InputStream preview(@PathParam("fileId") Long fileId, @Context UriInfo ur try { in = new FileInputStream(imageThumbFileName); } catch (Exception ex) { - // We don't particularly care what the reason why we have - // failed to access the file was. 
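
With these changes a thumbnail service is offered for PDFs as well as images, and the imagethumb endpoint chooses a generator from the optional mimetype query parameter. A compact sketch of that content-type dispatch (the generator names mirror ImageThumbConverter; the default-icon branch stands in for getWebappImageResource(DEFAULT_FILE_ICON)):

    public class ThumbDispatchDemo {

        enum Generator { IMAGE_THUMB, PDF_THUMB, DEFAULT_ICON }

        // PDFs go to generatePDFThumb, image/* to generateImageThumb,
        // everything else falls back to the default file icon.
        static Generator pick(String contentType) {
            if ("application/pdf".equalsIgnoreCase(contentType)) {
                return Generator.PDF_THUMB;
            }
            if (contentType != null && contentType.startsWith("image/")) {
                return Generator.IMAGE_THUMB;
            }
            return Generator.DEFAULT_ICON;
        }

        public static void main(String[] args) {
            System.out.println(pick("image/png"));        // IMAGE_THUMB
            System.out.println(pick("application/pdf"));  // PDF_THUMB
            System.out.println(pick("text/csv"));         // DEFAULT_ICON
        }
    }
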
- // From the point of view of the download subsystem, it's a - // binary operation -- it's either successfull or not. - // If we can't access it for whatever reason, we are saying - // it's 404 NOT FOUND in our HTTP response. return null; } return in; @@ -241,7 +244,10 @@ public InputStream dsPreview(@PathParam("datasetId") Long datasetId, @Context Ur List dataFiles = dataset.getFiles(); for (DataFile dataFile : dataFiles) { - if (dataFile.isImage()) { + if ("application/pdf".equalsIgnoreCase(dataFile.getContentType())) { + imageThumbFileName = ImageThumbConverter.generatePDFThumb(dataFile.getFileSystemLocation().toString(), 48); + break; + } else if (dataFile.isImage()) { imageThumbFileName = ImageThumbConverter.generateImageThumb(dataFile.getFileSystemLocation().toString(), 48); break; } @@ -257,12 +263,6 @@ public InputStream dsPreview(@PathParam("datasetId") Long datasetId, @Context Ur try { in = new FileInputStream(imageThumbFileName); } catch (Exception ex) { - // We don't particularly care what the reason why we have - // failed to access the file was. - // From the point of view of the download subsystem, it's a - // binary operation -- it's either successfull or not. - // If we can't access it for whatever reason, we are saying - // it's 404 NOT FOUND in our HTTP response. return null; } return in; diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstance.java b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstance.java index 1a3c6575541..d25a9838150 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstance.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/DownloadInstance.java @@ -3,70 +3,69 @@ * To change this template file, choose Tools | Templates * and open the template in the editor. */ - package edu.harvard.iq.dataverse.api; //import java.io.ByteArrayOutputStream; import java.util.List; import edu.harvard.iq.dataverse.dataaccess.OptionalAccessService; + /** * * @author Leonid Andreev */ public class DownloadInstance { /* - private ByteArrayOutputStream outStream = null; + private ByteArrayOutputStream outStream = null; - public ByteArrayOutputStream getOutStream() { - return outStream; - } + public ByteArrayOutputStream getOutStream() { + return outStream; + } - public void setOutStream(ByteArrayOutputStream outStream) { - this.outStream = outStream; - } - */ - - private DownloadInfo downloadInfo = null; - private String conversionParam = null; - private String conversionParamValue = null; - - public DownloadInstance (DownloadInfo info) { - this.downloadInfo = info; + public void setOutStream(ByteArrayOutputStream outStream) { + this.outStream = outStream; + } + */ + + private DownloadInfo downloadInfo = null; + private String conversionParam = null; + private String conversionParamValue = null; + + public DownloadInstance(DownloadInfo info) { + this.downloadInfo = info; } - - public DownloadInfo getDownloadInfo () { - return downloadInfo; + + public DownloadInfo getDownloadInfo() { + return downloadInfo; } - - public void setDownloadInfo (DownloadInfo info) { - this.downloadInfo = info; + + public void setDownloadInfo(DownloadInfo info) { + this.downloadInfo = info; } - - public String getConversionParam () { - return conversionParam; + + public String getConversionParam() { + return conversionParam; } - - public void setConversionParam (String param) { - this.conversionParam = param; + + public void setConversionParam(String param) { + this.conversionParam = param; } - - public String getConversionParamValue () { - return 
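
dsPreview scans the dataset's files in order and uses the first PDF or image it finds for the dataset thumbnail. A minimal sketch of that scan over plain content-type strings (the real loop works on DataFile entities):

    import java.util.Arrays;
    import java.util.List;

    public class DatasetPreviewDemo {

        // The first file that is a PDF or an image supplies the dataset thumbnail;
        // a null result means the caller substitutes the default file icon.
        static String pickPreviewContentType(List<String> fileContentTypes) {
            for (String ct : fileContentTypes) {
                if ("application/pdf".equalsIgnoreCase(ct)
                        || (ct != null && ct.startsWith("image/"))) {
                    return ct;
                }
            }
            return null;
        }

        public static void main(String[] args) {
            System.out.println(pickPreviewContentType(
                    Arrays.asList("text/tab-separated-values", "application/pdf", "image/png")));
            // application/pdf: the tab file is skipped, the PDF wins over the later image
        }
    }
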
conversionParamValue; + + public String getConversionParamValue() { + return conversionParamValue; } - - public void setConversionParamValue (String paramValue) { - this.conversionParamValue = paramValue; + + public void setConversionParamValue(String paramValue) { + this.conversionParamValue = paramValue; } - + // Move this method into the DownloadInfo instead -- ? - - public Boolean isDownloadServiceSupported (String serviceArg, String serviceArgValue) { + public Boolean isDownloadServiceSupported(String serviceArg, String serviceArgValue) { if (downloadInfo == null || serviceArg == null) { return false; } - + List servicesAvailable = downloadInfo.getServicesAvailable(); - + for (OptionalAccessService dataService : servicesAvailable) { if (dataService != null) { // Special case for the subsetting parameter (variables=): @@ -77,56 +76,56 @@ public Boolean isDownloadServiceSupported (String serviceArg, String serviceArgV // return true; // } //} else { - if ("imageThumb".equals(serviceArg)) { - if ("true".equals(serviceArgValue)) { - this.conversionParam = serviceArg; - this.conversionParamValue = ""; - } else { - this.conversionParam = serviceArg; - this.conversionParamValue = serviceArgValue; - } - return true; - } - String argValuePair = serviceArg + "=" + serviceArgValue; - if (argValuePair.startsWith(dataService.getServiceArguments())) { - conversionParam = serviceArg; - conversionParamValue = serviceArgValue; - return true; + if ("imageThumb".equals(serviceArg)) { + if ("true".equals(serviceArgValue)) { + this.conversionParam = serviceArg; + this.conversionParamValue = ""; + } else { + this.conversionParam = serviceArg; + this.conversionParamValue = serviceArgValue; } + return true; + } + String argValuePair = serviceArg + "=" + serviceArgValue; + if (argValuePair.startsWith(dataService.getServiceArguments())) { + conversionParam = serviceArg; + conversionParamValue = serviceArgValue; + return true; + } //} } } - return false; + return false; } - - public String getServiceFormatType (String serviceArg, String serviceArgValue) { + + public String getServiceFormatType(String serviceArg, String serviceArgValue) { if (downloadInfo == null || serviceArg == null) { return null; } - + List servicesAvailable = downloadInfo.getServicesAvailable(); - + for (OptionalAccessService dataService : servicesAvailable) { if (dataService != null) { // Special case for the subsetting parameter (variables=): if (serviceArg.equals("variables")) { if ("subset".equals(dataService.getServiceName())) { conversionParam = "subset"; - conversionParamValue = serviceArgValue; - return dataService.getMimeType(); + conversionParamValue = serviceArgValue; + return dataService.getMimeType(); } } else if (serviceArg.equals("imageThumb")) { return "image/png"; } else { - String argValuePair = serviceArg + "=" + serviceArgValue; + String argValuePair = serviceArg + "=" + serviceArgValue; if (argValuePair.equals(dataService.getServiceArguments())) { - conversionParam = serviceArg; - conversionParamValue = serviceArgValue; - return dataService.getMimeType(); + conversionParam = serviceArg; + conversionParamValue = serviceArgValue; + return dataService.getMimeType(); } } } } - return null; + return null; } } diff --git a/src/main/java/edu/harvard/iq/dataverse/api/SearchFields.java b/src/main/java/edu/harvard/iq/dataverse/api/SearchFields.java index db34b8aa3d4..bd92490caa7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/SearchFields.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/SearchFields.java @@ -48,7 
+48,6 @@ public class SearchFields { * "Author Affiliation" can be multivalued. */ public static final String AFFILIATION = "affiliation_ss"; - public static final String CITATION = "citation_t"; /** * @todo: use a field called "author" instead. Solr default has "author" as * "text_general" so the field is tokenized ("Foo Bar" becomes "foo" "bar" @@ -91,6 +90,7 @@ public class SearchFields { public static final String NAME_SORT = "name_sort"; public static final String PUBLICATION_DATE = "publication_date_s"; public static final String RELEASE_OR_CREATE_DATE = "release_or_create_date_dt"; + public static final String RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT = "date_en"; public static final String GROUPS = "groups_s"; public static final String PERMS = "perms_ss"; public static final String PUBLICATION_STATUS = "published_ss"; @@ -100,5 +100,6 @@ public class SearchFields { public static final String PARENT_ID = "parentid"; public static final String DATASET_DESCRIPTION = "dsDescription"; + public static final String DATASET_VERSION_ID = "dataset_version_id_l"; } diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/ImageThumbConverter.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/ImageThumbConverter.java index 3804d291f4f..1896f82c647 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/ImageThumbConverter.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/ImageThumbConverter.java @@ -51,40 +51,43 @@ public static FileAccessObject getImageThumb (DataFile file, FileAccessObject fi return getImageThumb (file, fileDownload, DEFAULT_THUMBNAIL_SIZE); } - public static FileAccessObject getImageThumb (DataFile file, FileAccessObject fileDownload, int size) { + public static FileAccessObject getImageThumb(DataFile file, FileAccessObject fileDownload, int size) { + String imageThumbFileName = null; + if (file != null && file.getContentType().substring(0, 6).equalsIgnoreCase("image/")) { - String imageThumbFileName = generateImageThumb(file.getFileSystemLocation().toString(), size); - if (imageThumbFileName != null) { - File imageThumbFile = new File(imageThumbFileName); - - if (imageThumbFile != null && imageThumbFile.exists()) { - - fileDownload.closeInputStream(); - fileDownload.setSize(imageThumbFile.length()); - - - InputStream imageThumbInputStream = null; - - try { - - imageThumbInputStream = new FileInputStream(imageThumbFile); - } catch (IOException ex) { - return null; - } - - if (imageThumbInputStream != null) { - fileDownload.setInputStream(imageThumbInputStream); - fileDownload.setIsLocalFile(true); - - fileDownload.setMimeType("image/png"); - } else { - return null; - } + imageThumbFileName = generateImageThumb(file.getFileSystemLocation().toString(), size); + } else if (file != null && file.getContentType().equalsIgnoreCase("application/pdf")) { + imageThumbFileName = generatePDFThumb(file.getFileSystemLocation().toString(), size); + } + + if (imageThumbFileName != null) { + File imageThumbFile = new File(imageThumbFileName); + + if (imageThumbFile != null && imageThumbFile.exists()) { + + fileDownload.closeInputStream(); + fileDownload.setSize(imageThumbFile.length()); + + InputStream imageThumbInputStream = null; + + try { + + imageThumbInputStream = new FileInputStream(imageThumbFile); + } catch (IOException ex) { + return null; + } + + if (imageThumbInputStream != null) { + fileDownload.setInputStream(imageThumbInputStream); + fileDownload.setIsLocalFile(true); + + fileDownload.setMimeType("image/png"); + } else { + return null; } } - } - - + 
} + return fileDownload; } @@ -185,4 +188,54 @@ public static String generateImageThumb(String fileLocation, int size) { return null; } } + + public static String generatePDFThumb(String fileLocation) { + return generatePDFThumb(fileLocation, DEFAULT_THUMBNAIL_SIZE); + } + + public static String generatePDFThumb(String fileLocation, int size) { + + String thumbFileLocation = fileLocation + ".thumb" + size; + + // see if the thumb is already generated and saved: + + if (new File(thumbFileLocation).exists()) { + return thumbFileLocation; + } + + // doesn't exist yet, let's attempt to generate it: + + String imageMagickExec = System.getProperty("dataverse.path.imagemagick.convert"); + + if ( imageMagickExec != null ) { + imageMagickExec = imageMagickExec.trim(); + } + + // default location: + + if ( imageMagickExec == null || imageMagickExec.equals("") ) { + imageMagickExec = "/usr/bin/convert"; + } + + if (new File(imageMagickExec).exists()) { + + String ImageMagick = imageMagickExec + " pdf:" + fileLocation + "[0] -resize "+ size + " -flatten png:" + thumbFileLocation; + int exitValue = 1; + + try { + Runtime runtime = Runtime.getRuntime(); + Process process = runtime.exec(ImageMagick); + exitValue = process.waitFor(); + } catch (Exception e) { + exitValue = 1; + } + + if (exitValue == 0 && new File(thumbFileLocation).exists()) { + return thumbFileLocation; + } + } + + return null; + + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateDatasetCommand.java index d53843c81a6..15dfbb71f2e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateDatasetCommand.java @@ -1,5 +1,6 @@ package edu.harvard.iq.dataverse.engine.command.impl; +import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetField; import edu.harvard.iq.dataverse.DataverseRole; @@ -50,11 +51,11 @@ public Dataset execute(CommandContext ctxt) throws CommandException { while (dsfItSort.hasNext()) { dsfItSort.next().setValueDisplayOrder(); } - return save(ctxt); - } - - public Dataset save(CommandContext ctxt) { - theDataset.getEditVersion().setCreateTime(new Timestamp(new Date().getTime())); + Date createDate = new Timestamp(new Date().getTime()); + theDataset.getEditVersion().setCreateTime(createDate); + for (DataFile dataFile: theDataset.getFiles() ){ + dataFile.setCreateDate(theDataset.getCreateDate()); + } Dataset savedDataset = ctxt.em().merge(theDataset); String indexingResult = ctxt.index().indexDataset(savedDataset); logger.log(Level.INFO, "during dataset save, indexing result was: {0}", indexingResult); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DeleteDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DeleteDatasetCommand.java index 128cbde071c..3fd862af97a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DeleteDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DeleteDatasetCommand.java @@ -1,29 +1,24 @@ package edu.harvard.iq.dataverse.engine.command.impl; -import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.Dataset; -import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.DataverseUser; import edu.harvard.iq.dataverse.engine.Permission; -import 
edu.harvard.iq.dataverse.engine.command.AbstractVoidCommand; import edu.harvard.iq.dataverse.engine.command.CommandContext; import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException; -import java.util.logging.Level; -import java.util.logging.Logger; /** * Deletes a data set. * @author michael */ @RequiredPermissions( Permission.DestructiveEdit ) -public class DeleteDatasetCommand extends AbstractVoidCommand { +public class DeleteDatasetCommand extends DestroyDataverseCommand { private final Dataset doomed; public DeleteDatasetCommand(Dataset doomed, DataverseUser aUser) { - super(aUser, doomed.getOwner()); + super(doomed, aUser); this.doomed = doomed; } @@ -32,24 +27,7 @@ protected void executeImpl(CommandContext ctxt) throws CommandException { if ( doomed.isReleased() ) { throw new IllegalCommandException("Cannot delete a released dataset", this); } - - final Dataset managedDoomed = ctxt.em().merge(doomed); - - // files - for ( DataFile df : managedDoomed.getFiles() ) { - ctxt.engine().submit( new DeleteDataFileCommand(df, getUser(), managedDoomed.getOwner()) ); - } - - // versions - for ( DatasetVersion ver : managedDoomed.getVersions() ) { - Logger.getLogger(DeleteDatasetCommand.class.getName()).log(Level.INFO, "deleting " + ver ); - DatasetVersion managed = ctxt.em().merge(ver); - Logger.getLogger(DeleteDatasetCommand.class.getName()).log(Level.INFO, " - Managed: " + managed ); - ctxt.em().remove( managed ); - } - - // dataset - ctxt.em().remove(managedDoomed); + super.executeImpl(ctxt); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DestroyDataverseCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DestroyDataverseCommand.java new file mode 100644 index 00000000000..2d5a311db18 --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DestroyDataverseCommand.java @@ -0,0 +1,61 @@ +package edu.harvard.iq.dataverse.engine.command.impl; + +import edu.harvard.iq.dataverse.DataFile; +import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetVersion; +import edu.harvard.iq.dataverse.DataverseRole; +import edu.harvard.iq.dataverse.DataverseUser; +import edu.harvard.iq.dataverse.RoleAssignment; +import edu.harvard.iq.dataverse.engine.Permission; +import edu.harvard.iq.dataverse.engine.command.AbstractVoidCommand; +import edu.harvard.iq.dataverse.engine.command.CommandContext; +import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; +import edu.harvard.iq.dataverse.engine.command.exception.CommandException; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * Same as {@link DeleteDataversCommand}, but does not stop it the dataset is published. + * This command is reserved for super-users, if at all. 
+ * @author michael + */ +@RequiredPermissions( Permission.DestructiveEdit ) +public class DestroyDataverseCommand extends AbstractVoidCommand { + + private final Dataset doomed; + + public DestroyDataverseCommand(Dataset doomed, DataverseUser aUser) { + super(aUser, doomed.getOwner()); + this.doomed = doomed; + } + + @Override + protected void executeImpl(CommandContext ctxt) throws CommandException { + + final Dataset managedDoomed = ctxt.em().merge(doomed); + + // ASSIGNMENTS + for ( RoleAssignment ra : ctxt.roles().directRoleAssignments(doomed) ) { + ctxt.em().remove(ra); + } + // ROLES + for ( DataverseRole ra : ctxt.roles().findByOwnerId(doomed.getId()) ) { + ctxt.em().remove(ra); + } + + // files + for ( DataFile df : managedDoomed.getFiles() ) { + ctxt.engine().submit( new DeleteDataFileCommand(df, getUser(), managedDoomed.getOwner()) ); + } + + // versions + for ( DatasetVersion ver : managedDoomed.getVersions() ) { + DatasetVersion managed = ctxt.em().merge(ver); + ctxt.em().remove( managed ); + } + + // dataset + ctxt.em().remove(managedDoomed); + } + +} diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/ReleaseDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/ReleaseDatasetCommand.java index 8dc6a55dbf8..76a637478bf 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/ReleaseDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/ReleaseDatasetCommand.java @@ -5,6 +5,7 @@ */ package edu.harvard.iq.dataverse.engine.command.impl; +import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.DataverseUser; @@ -64,8 +65,15 @@ public Dataset execute(CommandContext ctxt) throws CommandException { } } - theDataset.getEditVersion().setReleaseTime(new Timestamp(new Date().getTime())); + Timestamp updateTime = new Timestamp(new Date().getTime()); + theDataset.getEditVersion().setReleaseTime(updateTime); theDataset.getEditVersion().setVersionState(DatasetVersion.VersionState.RELEASED); + + for (DataFile dataFile: theDataset.getFiles() ){ + if(dataFile.getPublicationDate() == null){ + dataFile.setPublicationDate(updateTime); + } + } Dataset savedDataset = ctxt.em().merge(theDataset); String indexingResult = ctxt.index().indexDataset(savedDataset); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateDatasetCommand.java index 31771d35a80..e5c2cd1c250 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateDatasetCommand.java @@ -5,6 +5,7 @@ */ package edu.harvard.iq.dataverse.engine.command.impl; +import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetField; import edu.harvard.iq.dataverse.DataverseUser; @@ -14,6 +15,8 @@ import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import edu.harvard.iq.dataverse.engine.command.RequiredPermissionsMap; import edu.harvard.iq.dataverse.engine.command.exception.CommandException; +import java.sql.Timestamp; +import java.util.Date; import java.util.Iterator; import java.util.logging.Logger; @@ -55,9 +58,18 @@ public Dataset save(CommandContext ctxt) { while (dsfItSort.hasNext()) { dsfItSort.next().setValueDisplayOrder(); } - String indexingResult = 
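
ReleaseDatasetCommand above stamps the release timestamp only onto files that do not yet have a publication date, so re-releasing a dataset never rewrites dates set by earlier releases. A tiny sketch of that rule with a hypothetical file stub:

    import java.sql.Timestamp;
    import java.util.Arrays;
    import java.util.Date;
    import java.util.List;

    public class ReleaseStampDemo {

        // Hypothetical stand-in for DataFile, just to show the stamping rule.
        static class FileStub {
            Timestamp publicationDate;
        }

        public static void main(String[] args) {
            Timestamp releaseTime = new Timestamp(new Date().getTime());
            List<FileStub> files = Arrays.asList(new FileStub(), new FileStub());
            files.get(0).publicationDate = new Timestamp(0L); // published in an earlier version

            for (FileStub f : files) {
                if (f.publicationDate == null) {
                    f.publicationDate = releaseTime; // only previously unpublished files get stamped
                }
            }
            System.out.println(files.get(0).publicationDate + " / " + files.get(1).publicationDate);
        }
    }
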
ctxt.index().indexDataset(theDataset); - logger.info("during dataset save, indexing result was: " + indexingResult); + Timestamp updateTime = new Timestamp(new Date().getTime()); + + for (DataFile dataFile: theDataset.getFiles() ){ + if(dataFile.getCreateDate() == null){ + dataFile.setCreateDate(updateTime); + } + } + //String indexingResult = ctxt.index().indexDataset(theDataset); + //logger.info("during dataset save, indexing result was: " + indexingResult); Dataset savedDataset = ctxt.em().merge(theDataset); + String indexingResult = ctxt.index().indexDataset(savedDataset); + logger.info("during dataset save, indexing result was: " + indexingResult); return savedDataset; } diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index b957eace269..3db2fad333d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -52,6 +52,10 @@ import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.csv.CSVFileReaderSpi; import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.xlsx.XLSXFileReader; import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.xlsx.XLSXFileReaderSpi; +import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.sav.SAVFileReader; +import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.sav.SAVFileReaderSpi; +import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.por.PORFileReader; +import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.por.PORFileReaderSpi; import edu.harvard.iq.dataverse.util.FileUtil; import edu.harvard.iq.dataverse.util.SumStatCalculator; import java.io.BufferedInputStream; @@ -124,6 +128,8 @@ public class IngestServiceBean { private static final String MIME_TYPE_RDATA = "application/x-rlang-transport"; private static final String MIME_TYPE_CSV = "text/csv"; private static final String MIME_TYPE_XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; + private static final String MIME_TYPE_SPSS_SAV = "application/x-spss-sav"; + private static final String MIME_TYPE_SPSS_POR = "application/x-spss-por"; private static final String MIME_TYPE_TAB = "text/tab-separated-values"; @@ -307,6 +313,10 @@ public boolean ingestableAsTabular(DataFile dataFile) { return true; } else if (mimeType.equals(MIME_TYPE_XLSX)) { return true; + } else if (mimeType.equals(MIME_TYPE_SPSS_SAV)) { + return true; + } else if (mimeType.equals(MIME_TYPE_SPSS_POR)) { + return true; } return false; @@ -335,6 +345,10 @@ private TabularDataFileReader getTabDataReaderByMimeType(DataFile dataFile) { ingestPlugin = new CSVFileReader(new CSVFileReaderSpi()); } else if (mimeType.equals(MIME_TYPE_XLSX)) { ingestPlugin = new XLSXFileReader(new XLSXFileReaderSpi()); + } else if (mimeType.equals(MIME_TYPE_SPSS_SAV)) { + ingestPlugin = new DTAFileReader(new SAVFileReaderSpi()); + } else if (mimeType.equals(MIME_TYPE_SPSS_POR)) { + ingestPlugin = new DTAFileReader(new PORFileReaderSpi()); } return ingestPlugin; diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/InvalidData.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/InvalidData.java new file mode 100644 index 00000000000..4f584abc4cb --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/InvalidData.java @@ -0,0 +1,137 @@ +/* + Copyright (C) 2005-2012, by the President and Fellows of Harvard College. 
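
ingestableAsTabular() and getTabDataReaderByMimeType() now recognize the two SPSS formats alongside the existing tabular types. A sketch of the mime-type dispatch as a simple lookup table; the reader names on the SPSS rows are the plugin classes imported above (SAVFileReader, PORFileReader) and are shown only to illustrate the mapping, not as the factory code itself:

    import java.util.HashMap;
    import java.util.Map;

    public class TabularReaderDispatchDemo {
        public static void main(String[] args) {
            Map<String, String> readers = new HashMap<>();
            readers.put("text/csv", "CSVFileReader");
            readers.put("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "XLSXFileReader");
            readers.put("application/x-spss-sav", "SAVFileReader");
            readers.put("application/x-spss-por", "PORFileReader");

            System.out.println(readers.get("application/x-spss-por")); // PORFileReader
            System.out.println(readers.containsKey("text/plain"));     // false: not ingestable as tabular
        }
    }
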
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Dataverse Network - A web application to share, preserve and analyze research data. + Developed at the Institute for Quantitative Social Science, Harvard University. + Version 3.0. +*/ +package edu.harvard.iq.dataverse.ingest.tabulardata; + +import java.util.*; +import org.apache.commons.lang.builder.ToStringBuilder; +import org.apache.commons.lang.builder.ToStringStyle; +/** + * A class that stores information about a variables' invalid data. + * Akio Sone's original DVN v.2.* implementation, virtually unchanged. + * + * @author Akio Sone + * + * incorporated into Dataverse 4.0 by Leonid Andreev, 2014 + */ + +public class InvalidData { + + + public InvalidData(int type) { + this.type = type; + } + + int type; + + public int getType() { + return type; + } + + public void setType(int type) { + this.type = type; + } + + List invalidValues; + + public List getInvalidValues() { + return invalidValues; + } + + + public void setInvalidValues(List invalidValues) { + this.invalidValues = invalidValues; + } + + List invalidRange; + + public List getInvalidRange() { + return invalidRange; + } + + + public void setInvalidRange(List invalidRange) { + this.invalidRange = invalidRange; + } + + + + /* + * This method used to be used by the old DDIWriter. + * TODO: check how these values were affecting the behavior of + * the old DDIService import; implement the direct configuration + * of DataVariables as appropriate. + * -- L.A. 4.0 beta + */ + public String toDDItag(){ + StringBuilder sb = new StringBuilder(); + + switch(type){ + case 1: case 2: case 3: + sb.append("\t\t\n"); + for (int k=0; k < invalidValues.size();k++){ + sb.append("\t\t\t\n"); + } + sb.append("\t\t\n"); + break; + case -2: + // range-type 1 missing values + sb.append("\t\t\n"); + sb.append("\t\t\t\n"); + sb.append("\t\t\n"); + break; + case -3: + // range-type: 2 missing values + sb.append("\t\t\n"); + sb.append("\t\t\t\n"); + sb.append("\t\t\t\n"); + sb.append("\t\t\n"); + + break; + default: + } + return sb.toString(); + } + + /** + * Returns a string representation of this instance. + * + * @return a string representing this instance. + */ + @Override + public String toString() { + return ToStringBuilder.reflectionToString(this, + ToStringStyle.MULTI_LINE_STYLE); + } + +} diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/por/PORFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/por/PORFileReader.java new file mode 100644 index 00000000000..84fb5fbc559 --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/por/PORFileReader.java @@ -0,0 +1,1676 @@ +/* + Copyright (C) 2005-2012, by the President and Fellows of Harvard College. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Dataverse Network - A web application to share, preserve and analyze research data. + Developed at the Institute for Quantitative Social Science, Harvard University. + Version 3.0. +*/ +package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.por; + +import java.io.*; +import java.nio.*; +import java.util.logging.*; + +import java.util.*; +import java.util.regex.*; +import java.text.*; +import java.math.BigDecimal; +import java.math.MathContext; +import java.math.RoundingMode; + +import org.apache.commons.lang.*; +import org.apache.commons.codec.binary.Hex; +import javax.inject.Inject; +import javax.naming.Context; +import javax.naming.InitialContext; +import javax.naming.NamingException; + +import edu.harvard.iq.dataverse.DataTable; +import edu.harvard.iq.dataverse.datavariable.DataVariable; +import edu.harvard.iq.dataverse.datavariable.VariableCategory; +import edu.harvard.iq.dataverse.datavariable.VariableFormatType; +import edu.harvard.iq.dataverse.datavariable.VariableServiceBean; + +import edu.harvard.iq.dataverse.ingest.plugin.spi.*; +import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader; +import edu.harvard.iq.dataverse.ingest.tabulardata.spi.TabularDataFileReaderSpi; +import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest; +import edu.harvard.iq.dataverse.ingest.tabulardata.InvalidData; +import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.sav.SPSSConstants; + + + +/** + * ingest plugin for SPSS/POR ("portable") file format. + * + * This reader plugin has been fully re-implemented for the DVN 4.0; + * It is still borrows heavily from, and builds on the basis of the + * old implementation by Akio Sone, that was in use in the versions + * 2-3 of the DVN. 
+ * + * @author Akio Sone at UNC-Odum + * @author Leonid Andreev + */ + +public class PORFileReader extends TabularDataFileReader{ + @Inject + VariableServiceBean varService; + + // static fields ---------------------------------------------------------// + private static final String MissingValueForTextDataFile = ""; + + private TabularDataIngest ingesteddata = new TabularDataIngest(); + private DataTable dataTable = new DataTable(); + + private static final int POR_HEADER_SIZE = 500; + private static final int POR_MARK_POSITION_DEFAULT = 461; + private static final String POR_MARK = "SPSSPORT"; + private static final int LENGTH_SECTION_HEADER = 1; + private static final int LENGTH_SECTION_2 = 19; + private static final String MIME_TYPE = "application/x-spss-por"; + private static Pattern pattern4positiveInteger = Pattern.compile("[0-9A-T]+"); + private static Pattern pattern4Integer = Pattern.compile("[-]?[0-9A-T]+"); + private static Calendar GCO = new GregorianCalendar(); + static { + // set the origin of GCO to 1582-10-15 + GCO.set(1, 1582);// year + GCO.set(2, 9); // month + GCO.set(5, 15);// day of month + GCO.set(9, 0);// AM(0) or PM(1) + GCO.set(10, 0);// hh + GCO.set(12, 0);// mm + GCO.set(13, 0);// ss + GCO.set(14, 0); // SS millisecond + GCO.set(15, 0);// z + + } + private static final long SPSS_DATE_BIAS = 60*60*24*1000; + private static final long SPSS_DATE_OFFSET = SPSS_DATE_BIAS + Math.abs(GCO.getTimeInMillis()); + + + // instance fields -------------------------------------------------------// + + private static Logger dbgLog = Logger.getLogger(PORFileReader.class.getPackage().getName()); + + private boolean isCurrentVariableString = false; + private String currentVariableName = null; + + private int caseQnty=0; + private int varQnty=0; + + private Map variableTypeTable = new LinkedHashMap(); + private List variableTypelList = new ArrayList(); + private List printFormatList = new ArrayList(); + private Map printFormatTable = new LinkedHashMap(); + private Map printFormatNameTable = new LinkedHashMap(); + private Map formatCategoryTable = new LinkedHashMap(); + private Map> valueLabelTable = new LinkedHashMap>(); + private Map valueVariableMappingTable = new LinkedHashMap(); + private List variableNameList = new ArrayList(); + private Map variableLabelMap = new LinkedHashMap(); + // missing value table: string/numeric data are stored => String + // the number of missing values are unknown beforehand => List + private Map> missingValueTable = new LinkedHashMap>(); + // variableName=> missingValue type[field code] + private Map> missingValueCodeTable = new LinkedHashMap>(); + private Map invalidDataTable = new LinkedHashMap(); + private Set decimalVariableSet = new HashSet(); + private List formatDecimalPointPositionList= new ArrayList(); + + + + // date/time data format + private SimpleDateFormat sdf_ymd = new SimpleDateFormat("yyyy-MM-dd"); + private SimpleDateFormat sdf_ymdhms = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + private SimpleDateFormat sdf_dhms = new SimpleDateFormat("DDD HH:mm:ss"); + private SimpleDateFormat sdf_hms = new SimpleDateFormat("HH:mm:ss"); + + // DecimalFormat for doubles + // may need more setXXXX() to handle scientific data + private NumberFormat doubleNumberFormatter = new DecimalFormat(); + + private String[] variableFormatTypeList; + + + // Constructor -----------------------------------------------------------// + + public PORFileReader(TabularDataFileReaderSpi originator){ + super(originator); + } + + + private void init() throws 
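
The static block above pins the POR date origin (October 15, 1582, plus an extra one-day SPSS_DATE_BIAS) using raw Calendar field indexes. The same setup written with the named constants, as a readability check; the GMT timezone and the clear() call are added here only to make the printed values deterministic:

    import java.util.Calendar;
    import java.util.GregorianCalendar;
    import java.util.TimeZone;

    public class PorEpochDemo {
        public static void main(String[] args) {
            // Field index 1 is YEAR, 2 is MONTH, 5 is DAY_OF_MONTH, etc.
            Calendar origin = new GregorianCalendar(TimeZone.getTimeZone("GMT"));
            origin.clear();
            origin.set(Calendar.YEAR, 1582);
            origin.set(Calendar.MONTH, Calendar.OCTOBER); // month index 9
            origin.set(Calendar.DAY_OF_MONTH, 15);

            long originMillis = origin.getTimeInMillis();
            System.out.println(originMillis < 0); // true: 1582 precedes the 1970 epoch, hence Math.abs(...)

            long oneDayMillis = 60L * 60 * 24 * 1000;
            System.out.println(oneDayMillis + Math.abs(originMillis)); // the SPSS_DATE_OFFSET value
        }
    }
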
IOException { + + Context ctx = null; + try { + ctx = new InitialContext(); + varService = (VariableServiceBean) ctx.lookup("java:global/dataverse-4.0/VariableServiceBean"); + } catch (NamingException nex) { + try { + ctx = new InitialContext(); + varService = (VariableServiceBean) ctx.lookup("java:global/dataverse/VariableServiceBean"); + } catch (NamingException nex2) { + if (dbgLog.isLoggable(Level.INFO)) dbgLog.info("Could not look up initial context, or the variable service in JNDI!"); + throw new IOException ("Could not look up initial context, or the variable service in JNDI!"); + } + } + sdf_ymd.setTimeZone(TimeZone.getTimeZone("GMT")); + sdf_ymdhms.setTimeZone(TimeZone.getTimeZone("GMT")); + sdf_dhms.setTimeZone(TimeZone.getTimeZone("GMT")); + sdf_hms.setTimeZone(TimeZone.getTimeZone("GMT")); + + doubleNumberFormatter.setGroupingUsed(false); + doubleNumberFormatter.setMaximumFractionDigits(340); // TODO: 340?? -- L.A. 4.0 beta + } + + public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException{ + dbgLog.info("SAVFileReader: read() start"); + + if (dataFile != null) { + throw new IOException ("this plugin does not support external raw data files"); + } + + + File tempPORfile = decodeHeader(stream); + BufferedReader bfReader = null; + + try { + bfReader = new BufferedReader(new InputStreamReader(new FileInputStream(tempPORfile.getAbsolutePath()), "US-ASCII")); + if (bfReader == null){ + dbgLog.fine("bfReader is null"); + throw new IOException("bufferedReader is null"); + } + + decodeSec2(bfReader); + + while(true){ + + char[] header = new char[LENGTH_SECTION_HEADER]; // 1 byte + bfReader.read(header); + String headerId = Character.toString(header[0]); + + dbgLog.fine("////////////////////// headerId="+headerId+ "//////////////////////"); + + if (headerId.equals("Z")){ + throw new IOException("reading failure: wrong headerId(Z) here"); + } + + if (headerId.equals("F")) { + // missing value + if ((missingValueTable !=null) && (missingValueTable.size()>0)){ + processMissingValueData(); + } + } + + if (headerId.equals("8") && isCurrentVariableString){ + headerId = "8S"; + } + + decode(headerId, bfReader); + + + // for last iteration + if (headerId.equals("F")){ + // finished the last block (F == data) + // without reaching the end of this file. + break; + } + } + + + } finally { + try { + if (bfReader!= null){ + bfReader.close(); + } + } catch (IOException ex){ + ex.printStackTrace(); + } + + if (tempPORfile.exists()){ + tempPORfile.delete(); + } + } + + /* + * Finally, create data variables, assign types and formats; + * This is the code from 3.6: + // post-parsing processing + // save metadata to smd + // varialbe Name + + smd.setVariableName(variableNameList.toArray(new String[variableNameList.size()])); + smd.setVariableLabel(variableLabelMap); + smd.setMissingValueTable(missingValueTable); + dbgLog.finer("*************** missingValueCodeTable ***************:\n" + missingValueCodeTable); + smd.setInvalidDataTable(invalidDataTable); + smd.setValueLabelTable(valueLabelTable); + * TODO: make sure all of this is taken care of by the new plugin! + + * TODO: (maybe?) + * Instead of doing it here, perhaps all the type assignments need to + * be done on DataVariable objects directly; without relying on + * maps and lists here... -- L.A. 4.0 beta (?) 
+ */ + + + List variableList = new ArrayList(); + + for (int indx = 0; indx < variableTypelList.size(); indx++) { + + DataVariable dv = new DataVariable(); + String varName = variableNameList.get(indx); + dv.setName(varName); + dv.setLabel(variableLabelMap.get(varName)); + dv.setFormatSchemaName(printFormatNameTable.get(varName)); + + dv.setInvalidRanges(new ArrayList()); + dv.setSummaryStatistics( new ArrayList() ); + dv.setUnf("UNF:6:NOTCALCULATED"); + dv.setCategories(new ArrayList()); + variableList.add(dv); + + dv.setFileOrder(indx); + + dv.setDataTable(dataTable); + + int simpleType = 0; + if (variableTypelList.get(indx) != null) { + simpleType = variableTypelList.get(indx).intValue(); + } + + if (simpleType <= 0) { + // We need to make one last type adjustment: + // Dates and Times will be stored as character values in the + // dataverse tab files; even though they are not typed as + // strings at this point: + // TODO: + // Make sure the date/time format is properly preserved! + // (see the setFormatCategory below... but double-check!) + // -- L.A. 4.0 alpha + String variableFormatType = variableFormatTypeList[indx]; + if (variableFormatType != null + && (variableFormatType.equals("time") + || variableFormatType.equals("date"))) { + ///variableTypeMinimal[indx] = 1; + simpleType = 1; + + String formatCategory = formatCategoryTable.get(varName); + + if (formatCategory != null) { + dataTable.getDataVariables().get(indx).setFormatCategory(formatCategory); + } + } + } + + // OK, we can now assign the types: + + if (simpleType > 0) { + // String: + dataTable.getDataVariables().get(indx).setVariableFormatType(varService.findVariableFormatTypeByName("character")); + dataTable.getDataVariables().get(indx).setVariableIntervalType(varService.findVariableIntervalTypeByName("discrete")); + } else { + // Numeric: + dataTable.getDataVariables().get(indx).setVariableFormatType(varService.findVariableFormatTypeByName("numeric")); + // discrete or continuous? + // "decimal variables" become dataverse data variables of interval type "continuous": + + if (decimalVariableSet.contains(indx)) { + dataTable.getDataVariables().get(indx).setVariableIntervalType(varService.findVariableIntervalTypeByName("continuous")); + } else { + dataTable.getDataVariables().get(indx).setVariableIntervalType(varService.findVariableIntervalTypeByName("discrete")); + } + + } + } + + /* + * From the original (3.6) code: + //smd.setVariableTypeMinimal(ArrayUtils.toPrimitive(variableTypelList.toArray(new Integer[variableTypelList.size()]))); + smd.setVariableFormat(printFormatList); + smd.setVariableFormatName(printFormatNameTable); + smd.setVariableFormatCategory(formatCategoryTable); + smd.setValueLabelMappingTable(valueVariableMappingTable); + * TODO: + * double-check that it's all being taken care of by the new plugin! 
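
The type assignment above condenses to: date and time variables are stored as character values, positive simple types become character/discrete, and numerics are continuous only when the variable is in the decimal set. A small sketch of that decision, with illustrative names in place of the real DataVariable and VariableServiceBean calls:

    public class PorTypeAssignmentDemo {

        static String[] assignType(int simpleType, String formatType, boolean isDecimal) {
            if (simpleType <= 0 && ("date".equals(formatType) || "time".equals(formatType))) {
                simpleType = 1; // treat as a string so the formatted value survives in the tab file
            }
            if (simpleType > 0) {
                return new String[]{"character", "discrete"};
            }
            return new String[]{"numeric", isDecimal ? "continuous" : "discrete"};
        }

        public static void main(String[] args) {
            System.out.println(String.join("/", assignType(0, "date", false))); // character/discrete
            System.out.println(String.join("/", assignType(0, null, true)));    // numeric/continuous
            System.out.println(String.join("/", assignType(1, null, false)));   // character/discrete
        }
    }
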
+ * (for variable format and formatName, consult the SAV plugin) + */ + + dataTable.setDataVariables(variableList); + + // Assign value labels: + + assignValueLabels(valueLabelTable); + + ingesteddata.setDataTable(dataTable); + + dbgLog.info("SAVFileReader: read() end"); + return ingesteddata; + } + + private void decode(String headerId, BufferedReader reader) throws IOException{ + if (headerId.equals("1")) decodeProductName(reader); + else if (headerId.equals("2")) decodeLicensee(reader); + else if (headerId.equals("3")) decodeFileLabel(reader); + else if (headerId.equals("4")) decodeNumberOfVariables(reader); + else if (headerId.equals("5")) decodeFieldNo5(reader); + else if (headerId.equals("6")) decodeWeightVariable(reader); + else if (headerId.equals("7")) decodeVariableInformation(reader); + else if (headerId.equals("8")) decodeMissValuePointNumeric(reader); + else if (headerId.equals("8S")) decodeMissValuePointString(reader); + else if (headerId.equals("9")) decodeMissValueRangeLow(reader); + else if (headerId.equals("A")) decodeMissValueRangeHigh(reader); + else if (headerId.equals("B")) decodeMissValueRange(reader); + else if (headerId.equals("C")) decodeVariableLabel(reader); + else if (headerId.equals("D")) decodeValueLabel(reader); + else if (headerId.equals("E")) decodeDocument(reader); + else if (headerId.equals("F")) decodeData(reader); + } + + + private File decodeHeader(BufferedInputStream stream) throws IOException { + File tempPORfile = null; + + if (stream == null){ + throw new IllegalArgumentException("file == null!"); + } + + byte[] headerByes = new byte[POR_HEADER_SIZE]; + + if (stream.markSupported()){ + stream.mark(1000); + } + int nbytes = stream.read(headerByes, 0, POR_HEADER_SIZE); + + //printHexDump(headerByes, "hex dump of the byte-array"); + + if (nbytes == 0){ + throw new IOException("decodeHeader: reading failure"); + } else if ( nbytes < 491) { + // Size test: by defnition, it must have at least + // 491-byte header, i.e., the file size less than this threshold + // is not a POR file + dbgLog.fine("this file is NOT spss-por type"); + throw new IllegalArgumentException("file is not spss-por type"); + } + // rewind the current reading position back to the beginning + if (stream.markSupported()){ + stream.reset(); + } + + // line-terminating characters are usually one or two by defnition + // however, a POR file saved by a genuine SPSS for Windows + // had a three-character line terminator, i.e., failed to remove the + // original file's one-character terminator when it was opened, and + // saved it with the default two-character terminator without + // removing original terminators. 
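// [Editor's aside -- illustrative sketch, not part of this patch] On Java 7+
// the decode(headerId, reader) if/else chain earlier in this hunk could be a
// String switch; behavior is unchanged, and unknown header ids still fall
// through silently, as they do in the original chain.
private void decode(String headerId, BufferedReader reader) throws IOException {
    switch (headerId) {
        case "1":  decodeProductName(reader); break;
        case "2":  decodeLicensee(reader); break;
        case "3":  decodeFileLabel(reader); break;
        case "4":  decodeNumberOfVariables(reader); break;
        case "5":  decodeFieldNo5(reader); break;
        case "6":  decodeWeightVariable(reader); break;
        case "7":  decodeVariableInformation(reader); break;
        case "8":  decodeMissValuePointNumeric(reader); break;
        case "8S": decodeMissValuePointString(reader); break;
        case "9":  decodeMissValueRangeLow(reader); break;
        case "A":  decodeMissValueRangeHigh(reader); break;
        case "B":  decodeMissValueRange(reader); break;
        case "C":  decodeVariableLabel(reader); break;
        case "D":  decodeValueLabel(reader); break;
        case "E":  decodeDocument(reader); break;
        case "F":  decodeData(reader); break;
        default:   break; // ignore, as the original chain does
    }
}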
So we have to expect such a rare + // case + // + // terminator + // windows [0D0A]=> [1310] = [CR/LF] + // unix [0A] => [10] + // mac [0D] => [13] + // 3char [0D0D0A]=> [131310] spss for windows rel 15 + // + // terminating characters should be found at the following + // column positions[counting from 0]: + // unix case: [0A] : [80], [161], [242], [323], [404], [485] + // windows case: [0D0A] : [81], [163], [245], [327], [409], [491] + // : [0D0D0A] : [82], [165], [248], [331], [414], [495] + + // convert b into a ByteBuffer + + ByteBuffer buff = ByteBuffer.wrap(headerByes); + byte[] nlch = new byte[36]; + int pos1; + int pos2; + int pos3; + int ucase = 0; + int wcase = 0; + int mcase = 0; + int three = 0; + int nolines = 6; + int nocols = 80; + for (int i = 0; i < nolines; ++i) { + int baseBias = nocols * (i + 1); + // 1-char case + pos1 = baseBias + i; + buff.position(pos1); + dbgLog.finer("\tposition(1)=" + buff.position()); + int j = 6 * i; + nlch[j] = buff.get(); + + if (nlch[j] == 10) { + ucase++; + } else if (nlch[j] == 13) { + mcase++; + } + + // 2-char case + pos2 = baseBias + 2 * i; + buff.position(pos2); + dbgLog.finer("\tposition(2)=" + buff.position()); + + nlch[j + 1] = buff.get(); + nlch[j + 2] = buff.get(); + + // 3-char case + pos3 = baseBias + 3 * i; + buff.position(pos3); + dbgLog.finer("\tposition(3)=" + buff.position()); + + nlch[j + 3] = buff.get(); + nlch[j + 4] = buff.get(); + nlch[j + 5] = buff.get(); + + dbgLog.finer(i + "-th iteration position =" + + nlch[j] + "\t" + nlch[j + 1] + "\t" + nlch[j + 2]); + dbgLog.finer(i + "-th iteration position =" + + nlch[j + 3] + "\t" + nlch[j + 4] + "\t" + nlch[j + 5]); + + if ((nlch[j + 3] == 13) && + (nlch[j + 4] == 13) && + (nlch[j + 5] == 10)) { + three++; + } else if ((nlch[j + 1] == 13) && (nlch[j + 2] == 10)) { + wcase++; + } + + buff.rewind(); + } + + boolean windowsNewLine = true; + if (three == nolines) { + windowsNewLine = false; // lineTerminator = "0D0D0A" + } else if ((ucase == nolines) && (wcase < nolines)) { + windowsNewLine = false; // lineTerminator = "0A" + } else if ((ucase < nolines) && (wcase == nolines)) { + windowsNewLine = true; //lineTerminator = "0D0A" + } else if ((mcase == nolines) && (wcase < nolines)) { + windowsNewLine = false; //lineTerminator = "0D" + } + + + buff.rewind(); + int PORmarkPosition = POR_MARK_POSITION_DEFAULT; + if (windowsNewLine) { + PORmarkPosition = PORmarkPosition + 5; + } else if (three == nolines) { + PORmarkPosition = PORmarkPosition + 10; + } + + byte[] pormark = new byte[8]; + buff.position(PORmarkPosition); + buff.get(pormark, 0, 8); + String pormarks = new String(pormark); + + //dbgLog.fine("pormark =>" + pormarks + "<-"); + dbgLog.fine("pormark[hex: 53 50 53 53 50 4F 52 54 == SPSSPORT] =>" + + new String(Hex.encodeHex(pormark)) + "<-"); + + if (pormarks.equals(POR_MARK)) { + dbgLog.fine("POR ID toke test: Passed"); + init(); + + dataTable.setOriginalFileFormat(MIME_TYPE); + dataTable.setUnf("UNF:6:NOTCALCULATED"); + + } else { + dbgLog.fine("this file is NOT spss-por type"); + throw new IllegalArgumentException( + "decodeHeader: POR ID token was not found"); + } + + // save the POR file without new line characters + + FileOutputStream fileOutPOR = null; + Writer fileWriter = null; + + // Scanner class can handle three-character line-terminator + Scanner porScanner = null; + + try { + tempPORfile = File.createTempFile("tempPORfile.", ".por"); + fileOutPOR = new FileOutputStream(tempPORfile); + fileWriter = new BufferedWriter(new OutputStreamWriter(fileOutPOR, 
"utf8")); + porScanner = new Scanner(stream); + + // Because 64-bit and 32-bit machines decode POR's first 40-byte + // sequence differently, the first 5 leader lines are skipped from + // the new-line-stripped file + + int lineCounter= 0; + while(porScanner.hasNextLine()){ + lineCounter++; + if (lineCounter<=5){ + String line = porScanner.nextLine().toString(); + dbgLog.fine("line="+lineCounter+":"+line.length()+":"+line); + } else { + fileWriter.write(porScanner.nextLine().toString()); + } + } + } finally { + try{ + if (fileWriter != null){ + fileWriter.close(); + } + } catch (IOException ex){ + ex.printStackTrace(); + } + + if (porScanner != null){ + porScanner.close(); + } + } + + return tempPORfile; + } + + + + private void decodeSec2(BufferedReader reader) throws IOException { + dbgLog.fine("decodeSec2(): start"); + if (reader ==null){ + throw new IllegalArgumentException("decodeSec2: stream == null!"); + } + + // Because a 64-bit machine may not save the first 40 + // bytes of a POR file in a way as a 32-bit machine does, + // the first 5 lines of a POR file is excluded from the read-back + // file and the new 1st line contains the format mark "SPSSPORT" + // somewhere in it. + + // mark the start position for the later rewind + if (reader.markSupported()){ + reader.mark(100000); + } + + + char[] sixthLineCharArray = new char[80]; + int nbytes_sixthLine = reader.read(sixthLineCharArray); + + String sixthLine = new String(sixthLineCharArray); + dbgLog.info("sixthLineCharArray="+ + Arrays.deepToString(ArrayUtils.toObject(sixthLineCharArray))); + int signatureLocation = sixthLine.indexOf(POR_MARK); + + if (signatureLocation >= 0){ + dbgLog.info("format signature was found at:"+signatureLocation); + } else { + dbgLog.severe("signature string was not found"); + throw new IOException("signature string was not found"); + } + + // rewind the position to the beginning + reader.reset(); + + // skip bytes up to the signature string + long skippedBytes = reader.skip(signatureLocation); + + char[] sec2_leader = new char[POR_MARK.length()]; + int nbytes_sec2_leader = reader.read(sec2_leader); + + String leader_string = new String(sec2_leader); + + dbgLog.info("format signature [SPSSPORT] detected="+leader_string); + + + if (leader_string.equals("SPSSPORT")){ + dbgLog.info("signature was correctly detected"); + + } else { + dbgLog.severe( + "the format signature is not found at the previously located column"); + throw new IOException("decodeSec2: failed to find the signature string"); + } + + int length_section_2 = LENGTH_SECTION_2; + + char[] Sec2_bytes = new char[length_section_2]; + + int nbytes_sec2 = reader.read(Sec2_bytes); + + if (nbytes_sec2 == 0){ + dbgLog.severe("decodeSec2: reading error"); + throw new IOException("decodeSec2: reading error"); + } else { + dbgLog.fine("bytes read="+nbytes_sec2); + } + + String sec2 = new String(Sec2_bytes); + dbgLog.fine("sec2[creation date/time]="+sec2); + + // sec2 + // 0123456789012345678 + // A8/YYYYMMDD6/HHMMSS + // thus + // section2 should has 3 elements + + String[] section2 = StringUtils.split(sec2, '/'); + + dbgLog.fine("section2="+StringUtils.join(section2, "|")); + + String fileCreationDate =null; + String fileCreationTime = null; + if ((section2.length == 3)&& (section2[0].startsWith("A"))){ + fileCreationDate = section2[1].substring(0,7); + fileCreationTime = section2[2]; + } else { + dbgLog.severe("decodeSec2: file creation date/time were not correctly detected"); + throw new IOException("decodeSec2: file creation date/time were not 
correctly detected"); + } + dbgLog.fine("fileCreationDate="+fileCreationDate); + dbgLog.fine("fileCreationTime="+fileCreationTime); + ///smd.getFileInformation().put("fileCreationDate", fileCreationDate); + ///smd.getFileInformation().put("fileCreationTime", fileCreationTime); + ///smd.getFileInformation().put("varFormat_schema", "SPSS"); + } + + + private void decodeProductName(BufferedReader reader) throws IOException { + if (reader ==null){ + throw new IllegalArgumentException("decodeProductName: reader == null!"); + } + + String productName = parseStringField(reader); + ///smd.getFileInformation().put("productName", productName); + } + + + private void decodeLicensee(BufferedReader reader) throws IOException { + if (reader ==null){ + throw new IllegalArgumentException("decodeLicensee: reader == null!"); + } + + String licenseeName = parseStringField(reader); + ///smd.getFileInformation().put("licenseeName", licenseeName); + } + + + private void decodeFileLabel(BufferedReader reader) throws IOException { + if (reader ==null){ + throw new IllegalArgumentException("decodeFileLabel: reader == null!"); + } + + String fileLabel = parseStringField(reader); + // TODO: is this "file label" potentially useful? -- L.A. 4.0 beta + ///smd.getFileInformation().put("fileLabel", fileLabel); + } + + + private void decodeNumberOfVariables(BufferedReader reader) throws IOException { + if (reader ==null){ + throw new IllegalArgumentException("decodeNumberOfVariables: reader == null!"); + } + + String temp = null; + char[] tmp = new char[1]; + StringBuilder sb = new StringBuilder(); + + while (reader.read(tmp) > 0) { + temp = Character.toString(tmp[0]); + if (temp.equals("/")) { + break; + } else { + sb.append(temp); + } + } + + String rawNumberOfVariables = sb.toString(); + int rawLength = rawNumberOfVariables.length(); + + String numberOfVariables = StringUtils.stripStart((StringUtils.strip(rawNumberOfVariables)), "0"); + + if ((numberOfVariables.equals("")) && (numberOfVariables.length() == rawLength)){ + numberOfVariables ="0"; + } + + varQnty = Integer.valueOf(numberOfVariables, 30); + dataTable.setVarQuantity(Long.valueOf(numberOfVariables, 30)); + } + + + private void decodeFieldNo5(BufferedReader reader) throws IOException { + if (reader ==null){ + throw new IllegalArgumentException("decodeFieldNo5: reader == null!"); + } + + int field5 = parseNumericField(reader); + } + + + private void decodeWeightVariable(BufferedReader reader) throws IOException { + if (reader ==null){ + throw new IllegalArgumentException("decodeWeightVariable: reader == null!"); + } + + String weightVariableName = parseStringField(reader); + // TODO: make sure case weight variables are properly handled! + // -- L.A. 
4.0 beta + ///smd.getFileInformation().put("caseWeightVariableName", weightVariableName); + ///smd.setCaseWeightVariableName(weightVariableName); + } + + + private void decodeVariableInformation(BufferedReader reader) throws IOException { + if (reader ==null){ + throw new IllegalArgumentException("decodeVariableInformation: reader == null!"); + } + + // step 1: variable type + int variableType = parseNumericField(reader); + variableTypelList.add(variableType); + isCurrentVariableString = (variableType > 0); + + + // step 2: variable name + String variableName = parseStringField(reader); + currentVariableName = variableName; + variableNameList.add(variableName); + variableTypeTable.put(variableName,variableType); + + // step 3: format(print/write) + int[] printWriteFormatTable = new int[6]; + for (int i=0; i < 6; i++){ + printWriteFormatTable[i]= parseNumericField(reader); + } + + int formatCode = printWriteFormatTable[0]; + int formatWidth = printWriteFormatTable[1]; + int formatDecimalPointPosition = printWriteFormatTable[2]; + + formatDecimalPointPositionList.add(formatDecimalPointPosition); + if (!SPSSConstants.FORMAT_CODE_TABLE_POR.containsKey(formatCode)){ + throw new IOException("Unknown format code was found = " + formatCode); + } else { + printFormatList.add(printWriteFormatTable[0]); + } + + if (!SPSSConstants.ORDINARY_FORMAT_CODE_SET.contains(formatCode)){ + StringBuilder sb = new StringBuilder(SPSSConstants.FORMAT_CODE_TABLE_POR.get(formatCode) + formatWidth); + if (formatDecimalPointPosition > 0){ + sb.append("."+ formatDecimalPointPosition); + } + printFormatNameTable.put(variableName, sb.toString()); + } + + printFormatTable.put(variableName, SPSSConstants.FORMAT_CODE_TABLE_POR.get(formatCode)); + } + + + private void decodeMissValuePointNumeric(BufferedReader reader) throws IOException { + if (reader ==null){ + throw new IllegalArgumentException("decodeMissValuePointNumeric: reader == null!"); + } + + if (missingValueCodeTable.containsKey(currentVariableName)){ + missingValueCodeTable.get(currentVariableName).add("8"); + } else { + List mvc = new ArrayList(); + mvc.add("8"); + missingValueCodeTable.put(currentVariableName, mvc); + } + + String missingValuePoint=null; + + // missing values are not always integers + String base30value = getNumericFieldAsRawString(reader); + if (base30value.indexOf(".")>=0){ + missingValuePoint = doubleNumberFormatter.format(base30Tobase10Conversion(base30value)); + } else { + missingValuePoint= Integer.valueOf(base30value, 30).toString(); + } + + if (missingValueTable.containsKey(currentVariableName)){ + // already stored + (missingValueTable.get(currentVariableName)).add(missingValuePoint); + } else { + // no missing value stored + List mv = new ArrayList(); + mv.add(missingValuePoint); + missingValueTable.put(currentVariableName, mv); + } + } + + + private void decodeMissValuePointString(BufferedReader reader) throws IOException { + if (reader ==null){ + throw new IllegalArgumentException("decodeMissValuePointString: reader == null!"); + } + + if (missingValueCodeTable.containsKey(currentVariableName)){ + missingValueCodeTable.get(currentVariableName).add("8"); + } else { + List mvc = new ArrayList(); + mvc.add("8"); + missingValueCodeTable.put(currentVariableName, mvc); + } + + String missingValuePointString = parseStringField(reader); + + if (missingValueTable.containsKey(currentVariableName)){ + // already stored + (missingValueTable.get(currentVariableName)).add(missingValuePointString); + } else { + // no missing value stored + List mv 
= new ArrayList(); + mv.add(missingValuePointString); + missingValueTable.put(currentVariableName, mv); + } + } + + + private void decodeMissValueRangeLow(BufferedReader reader) throws IOException { + if (reader ==null){ + throw new IllegalArgumentException("decodeMissValueRangeLow: reader == null!"); + } + + if (missingValueCodeTable.containsKey(currentVariableName)){ + missingValueCodeTable.get(currentVariableName).add("9"); + } else { + List mvc = new ArrayList(); + mvc.add("9"); + missingValueCodeTable.put(currentVariableName, mvc); + } + + String missingValueRangeLOtype=null; + + // missing values are not always integers + String base30value = getNumericFieldAsRawString(reader); + + if (base30value.indexOf(".")>=0){ + missingValueRangeLOtype = doubleNumberFormatter.format(base30Tobase10Conversion(base30value)); + } else { + missingValueRangeLOtype= Integer.valueOf(base30value, 30).toString(); + } + + if (missingValueTable.containsKey(currentVariableName)){ + // already stored + (missingValueTable.get(currentVariableName)).add("LOWEST"); + (missingValueTable.get(currentVariableName)).add(missingValueRangeLOtype); + } else { + // no missing value stored + List mv = new ArrayList(); + mv.add("LOWEST"); + mv.add(missingValueRangeLOtype); + missingValueTable.put(currentVariableName, mv); + } + } + + + private void decodeMissValueRangeHigh(BufferedReader reader) throws IOException { + if (reader ==null){ + throw new IllegalArgumentException("decodeMissValueRangeHigh: reader == null!"); + } + + if (missingValueCodeTable.containsKey(currentVariableName)){ + missingValueCodeTable.get(currentVariableName).add("A"); + } else { + List mvc = new ArrayList(); + mvc.add("A"); + missingValueCodeTable.put(currentVariableName, mvc); + } + + String missingValueRangeHItype = null; + + // missing values are not always integers + String base30value = getNumericFieldAsRawString(reader); + + if (base30value.indexOf(".")>=0){ + missingValueRangeHItype = doubleNumberFormatter.format(base30Tobase10Conversion(base30value)); + } else { + missingValueRangeHItype= Integer.valueOf(base30value, 30).toString(); + } + + if (missingValueTable.containsKey(currentVariableName)){ + // already stored + (missingValueTable.get(currentVariableName)).add(missingValueRangeHItype); + (missingValueTable.get(currentVariableName)).add("HIGHEST"); + } else { + // no missing value stored + List mv = new ArrayList(); + mv.add(missingValueRangeHItype); + mv.add("HIGHEST"); + missingValueTable.put(currentVariableName, mv); + } + } + + + private void decodeMissValueRange(BufferedReader reader) throws IOException { + if (reader ==null){ + throw new IllegalArgumentException("decodeMissValueRange: reader == null!"); + } + + if (missingValueCodeTable.containsKey(currentVariableName)){ + missingValueCodeTable.get(currentVariableName).add("B"); + } else { + List mvc = new ArrayList(); + mvc.add("B"); + missingValueCodeTable.put(currentVariableName, mvc); + } + + String[] missingValueRange = new String[2]; + + // missing values are not always integers + String base30value0 = getNumericFieldAsRawString(reader); + + if (base30value0.indexOf(".")>=0){ + missingValueRange[0] = doubleNumberFormatter.format(base30Tobase10Conversion(base30value0)); + } else { + missingValueRange[0]= Integer.valueOf(base30value0, 30).toString(); + } + + String base30value1 = getNumericFieldAsRawString(reader); + + if (base30value1.indexOf(".")>=0){ + missingValueRange[1] = doubleNumberFormatter.format(base30Tobase10Conversion(base30value1)); + } else { + 
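// [Editor's aside -- illustrative sketch, not part of this patch] Every
// decodeMissValue* method repeats the same "create the list if absent, then
// add" idiom for missingValueCodeTable and missingValueTable. With typed maps
// on Java 8+ it collapses to one line per value (hypothetical helper):
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

static void addMissingValue(Map<String, List<String>> table, String varName, String value) {
    table.computeIfAbsent(varName, k -> new ArrayList<>()).add(value); // requires Java 8+
}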
missingValueRange[1]= Integer.valueOf(base30value1, 30).toString(); + } + + if (missingValueTable.containsKey(currentVariableName)){ + // already stored + (missingValueTable.get(currentVariableName)).add(missingValueRange[0]); + (missingValueTable.get(currentVariableName)).add(missingValueRange[1]); + } else { + // no missing value stored + List mv = new ArrayList(); + mv.add(missingValueRange[0]); + mv.add(missingValueRange[1]); + missingValueTable.put(currentVariableName, mv); + } + } + + + private void decodeVariableLabel(BufferedReader reader) throws IOException { + if (reader ==null){ + throw new IllegalArgumentException("decodeVariableLabel: reader == null!"); + } + + String variableLabel = parseStringField(reader); + variableLabelMap.put(currentVariableName, variableLabel); + // note: not all variables have their variable label; therefore, + // saving them to the metatadata object is done within read() method + + } + + + private void decodeValueLabel(BufferedReader reader) throws IOException { + Map valueLabelSet = new LinkedHashMap(); + + int numberOfVariables = parseNumericField(reader); + String[] variableNames = new String[numberOfVariables]; + + for (int i= 0; i< numberOfVariables; i++){ + variableNames[i] = parseStringField(reader); + } + + int numberOfvalueLabelSets = parseNumericField(reader); + boolean isStringType = variableTypeTable.get(variableNames[0]) > 0 ? true : false; + + for (int i=0; i dataTableList = new ArrayList(); + List dateFormatList = new ArrayList(); + int[] variableTypeFinal= new int[varQnty]; + + // create a File object to save the tab-delimited data file + File tabDelimitedDataFile = File.createTempFile("tempTabfile.", ".tab"); + ///smd.getFileInformation().put("tabDelimitedDataFileLocation", tabDelimitedDataFile.getAbsolutePath()); + // 4.0: + ingesteddata.setTabDelimitedFile(tabDelimitedDataFile); + + + FileOutputStream fileOutTab = null; + PrintWriter pwout = null; + + try { + fileOutTab = new FileOutputStream(tabDelimitedDataFile); + pwout = new PrintWriter(new OutputStreamWriter(fileOutTab, "utf8"), true); + + variableFormatTypeList = new String[varQnty]; + for (int i = 0; i < varQnty; i++) { + variableFormatTypeList[i] = SPSSConstants.FORMAT_CATEGORY_TABLE.get(printFormatTable.get(variableNameList.get(i))); + formatCategoryTable.put(variableNameList.get(i), variableFormatTypeList[i]); + } + + // contents (variable) checker concering decimals + Arrays.fill(variableTypeFinal, 0); + + // raw-case counter + int j = 0; // case + + // use while instead for because the number of cases (observations) is usually unknown + FBLOCK: while(true){ + j++; + + // case(row)-wise storage object; to be updated after each row-reading + + String[] casewiseRecord = new String[varQnty]; + String[] caseWiseDateFormat = new String[varQnty]; + String[] casewiseRecordForTabFile = new String[varQnty]; + // warning: the above object is later shallow-copied to the + // data object for calculating a UNF value/summary statistics + // + + for (int i=0; i 0 ? 
true : false; + + if (isStringType){ + // String case + variableTypeFinal[i]=-1; + + StringBuilder sb_StringLengthBase30 = new StringBuilder(""); + int stringLengthBase10 = 0; + String buffer = ""; + char[] tmp = new char[1]; + + int nint; + while((nint = reader.read(tmp))>0){ + buffer = Character.toString(tmp[0]); + if (buffer.equals("/")){ + break; + } else if (buffer.equals("Z")){ + if (i == 0){ + // the reader has passed the last case; subtract 1 from the j counter + caseQnty = j-1; + break FBLOCK; + } + } else { + sb_StringLengthBase30.append(buffer); + } + + + } + + if (nint == 0){ + // no more data to be read (reached the eof) + caseQnty = j - 1; + break FBLOCK; + } + + + dbgLog.finer(j+"-th case "+i+"=th var:datum length=" +sb_StringLengthBase30.toString()); + + // this length value should be a positive integer + Matcher mtr = pattern4positiveInteger.matcher(sb_StringLengthBase30.toString()); + if (mtr.matches()){ + stringLengthBase10 = Integer.valueOf(sb_StringLengthBase30.toString(), 30); + } else{ + // reading error case + throw new IOException("reading F(data) section: string: length is not integer"); + } + + // read this string-variable's contents after "/" + char[] char_datumString = new char[stringLengthBase10]; + reader.read(char_datumString); + + String datum = new String(char_datumString); + casewiseRecord[i]= datum; + casewiseRecordForTabFile[i] = "\"" + datum.replaceAll("\"",Matcher.quoteReplacement("\\\"")) + "\""; + // end of string case + } else { + + // numeric case + StringBuilder sb_datumNumericBase30 = new StringBuilder(""); + boolean isMissingValue = false; + String datum = null; + String datumForTabFile = null; + String datumDateFormat = null; + + String buffer = ""; + char[] tmp = new char[1]; + int nint; + while((nint = reader.read(tmp))>0){ + sb_datumNumericBase30.append(buffer); + buffer = Character.toString(tmp[0]); + + if (buffer.equals("/")){ + break; + } else if (buffer.equals("Z")){ + if (i == 0){ + // the reader has passed the last case + // subtract 1 from the j counter + dbgLog.fine("Z-mark was detected"); + caseQnty = j-1; + break FBLOCK; + } + } else if (buffer.equals("*")) { + // '*' is the first character of the system missing value + datumForTabFile = MissingValueForTextDataFile; + datum = null; + isMissingValue = true; + + // read next char '.' 
as part of the missing value + reader.read(tmp); + buffer = Character.toString(tmp[0]); + break; + } + + } + if (nint == 0){ + // no more data to be read; reached the eof + caseQnty = j - 1; + break FBLOCK; + } + + // follow-up process for non-missing-values + if (!isMissingValue) { + // decode a numeric datum as String + String datumNumericBase30 = sb_datumNumericBase30.toString(); + Matcher matcher = pattern4Integer.matcher(datumNumericBase30); + + if (matcher.matches()){ + // integer case + datum = Long.valueOf(datumNumericBase30, 30).toString(); + } else { + // double case + datum = doubleNumberFormatter.format(base30Tobase10Conversion(datumNumericBase30)); + } + + // now check format (if date or time) + String variableFormatType = variableFormatTypeList[i]; + + if (variableFormatType.equals("date")){ + variableTypeFinal[i]=-1; + long dateDatum = Long.parseLong(datum)*1000L- SPSS_DATE_OFFSET; + datum = sdf_ymd.format(new Date(dateDatum)); + datumDateFormat = sdf_ymd.toPattern(); + + } else if (variableFormatType.equals("time")) { + variableTypeFinal[i]=-1; + int formatDecimalPointPosition = formatDecimalPointPositionList.get(i); + + if (printFormatTable.get(variableNameList.get(i)).equals("DTIME")){ + + if (datum.indexOf(".") < 0){ + long dateDatum = Long.parseLong(datum)*1000L - SPSS_DATE_BIAS; + datum = sdf_dhms.format(new Date(dateDatum)); + // don't save date format for dtime + } else { + // decimal point included + String[] timeData = datum.split("\\."); + long dateDatum = Long.parseLong(timeData[0])*1000L - SPSS_DATE_BIAS; + StringBuilder sb_time = new StringBuilder(sdf_dhms.format(new Date(dateDatum))); + + if (formatDecimalPointPosition > 0){ + sb_time.append("."+timeData[1].substring(0,formatDecimalPointPosition)); + } + + datum = sb_time.toString(); + // don't save date format for dtime + } + + } else if (printFormatTable.get(variableNameList.get(i)).equals("DATETIME")){ + + if (datum.indexOf(".") < 0){ + long dateDatum = Long.parseLong(datum)*1000L - SPSS_DATE_OFFSET; + datum = sdf_ymdhms.format(new Date(dateDatum)); + datumDateFormat = sdf_ymdhms.toPattern(); + } else { + // decimal point included + String[] timeData = datum.split("\\."); + long dateDatum = Long.parseLong(timeData[0])*1000L- SPSS_DATE_OFFSET; + StringBuilder sb_time = new StringBuilder(sdf_ymdhms.format(new Date(dateDatum))); + + if (formatDecimalPointPosition > 0){ + sb_time.append("."+timeData[1].substring(0,formatDecimalPointPosition)); + } + + datum = sb_time.toString(); + datumDateFormat = sdf_ymdhms.toPattern() + (formatDecimalPointPosition > 0 ? ".S" : "" ); + } + + } else if (printFormatTable.get(variableNameList.get(i)).equals("TIME")){ + + if (datum.indexOf(".") < 0){ + long dateDatum = Long.parseLong(datum)*1000L; + datum = sdf_hms.format(new Date(dateDatum)); + datumDateFormat = sdf_hms.toPattern(); + } else { + // decimal point included + String[] timeData = datum.split("\\."); + long dateDatum = Long.parseLong(timeData[0])*1000L; + StringBuilder sb_time = new StringBuilder(sdf_hms.format(new Date(dateDatum))); + + if (formatDecimalPointPosition > 0){ + sb_time.append("."+timeData[1].substring(0,formatDecimalPointPosition)); + } + + datum = sb_time.toString(); + datumDateFormat = sdf_hms.toPattern() + (formatDecimalPointPosition > 0 ? 
".S" : "" ); + } + } + + } else if (variableFormatType.equals("other")){ + + if (printFormatTable.get(variableNameList.get(i)).equals("WKDAY")){ + // day of week + variableTypeFinal[i]=-1; + datum = SPSSConstants.WEEKDAY_LIST.get(Integer.valueOf(datum)-1); + + } else if (printFormatTable.get(variableNameList.get(i)).equals("MONTH")){ + // month + variableTypeFinal[i]=-1; + datum = SPSSConstants.MONTH_LIST.get(Integer.valueOf(datum)-1); + } + } + + // since value is not missing, set both values to be the same + datumForTabFile = datum; + + // decimal-point check (variable is integer or not) + if (variableTypeFinal[i]==0){ + if (datum.indexOf(".") >=0){ + variableTypeFinal[i] = 1; + decimalVariableSet.add(i); + } + } + } + + casewiseRecord[i]= datum; + caseWiseDateFormat[i] = datumDateFormat; + casewiseRecordForTabFile[i]= datumForTabFile; + + } // end: if: string vs numeric variable + + } // end:for-loop-i (variable-wise loop) + + + // print the i-th case; use casewiseRecord to dump the current case to the tab-delimited file + pwout.println(StringUtils.join(casewiseRecordForTabFile, "\t")); + // store the current case-holder object to the data object for later operations such as UNF/summary statistics + dataTableList.add(casewiseRecord); + dateFormatList.add(caseWiseDateFormat); + + } // end: while-block + } finally { + // close the print writer + if (pwout != null) { + pwout.close(); + } + } + + ///smd.setDecimalVariables(decimalVariableSet); + ///smd.getFileInformation().put("caseQnty", caseQnty); + dataTable.setCaseQuantity(new Long(caseQnty)); + + } + + + private void processMissingValueData(){ + /* + + POR's missing-value storage differs form the counterpart of SAV; + this method transforms the POR-native storage to the SAV-type + after this process, missingValueTable contains point-type + missing values for later catStat/sumStat processing; + range and mixed type cases are stored in invalidDataTable + + missingValueCodeTable= + {VAR1=[9], VAR2=[A], VAR3=[9, 8], VAR4=[A, 8], + VAR5=[8, 8, 8], VAR6=[B], VAR7=[B, 8]} + + missingValueTable= + {VAR1=[-1], VAR2=[-1], VAR3=[-2, -1], VAR4=[-1, -2], + VAR5=[-1, -2, -3], VAR6=[-2, -1], VAR7=[-3, -2, -1]} + + + missingValueTable={VAR1=[], VAR2=[], VAR3=[-1], VAR4=[-2], + VAR5=[-1, -2, -3], VAR6=[], VAR7=[-2]} + + */ + + dbgLog.fine("missingValueCodeTable="+missingValueCodeTable); + Set>> msvlc = missingValueCodeTable.entrySet(); + for (Iterator>> itc = msvlc.iterator(); itc.hasNext();){ + Map.Entry> et = itc.next(); + String variable = et.getKey(); + dbgLog.fine("variable="+variable); + List codeList = et.getValue(); + List valueList = missingValueTable.get(variable); + dbgLog.fine("codeList="+codeList); + dbgLog.fine("valueList="+valueList); + int type; + InvalidData invalidDataInfo = null; + if (valueList.size() == 3){ + if (codeList.get(0).equals("8") && codeList.get(1).equals("8") && + codeList.get(2).equals("8") ){ + type = 3; + invalidDataInfo = new InvalidData(type); + invalidDataInfo.setInvalidValues(valueList); + } else if (codeList.get(0).equals("9") && codeList.get(1).equals("8")){ + type = -3; + + invalidDataInfo = new InvalidData(type); + invalidDataInfo.setInvalidValues(valueList.subList(2, 3)); + invalidDataInfo.setInvalidRange(valueList.subList(0, 2)); + + } else if (codeList.get(0).equals("A") && codeList.get(1).equals("8")){ + type = -3; + invalidDataInfo = new InvalidData(type); + invalidDataInfo.setInvalidValues(valueList.subList(2, 3)); + invalidDataInfo.setInvalidRange(valueList.subList(0, 2)); + } else if 
(codeList.get(0).equals("B") && codeList.get(1).equals("8")){ + type = -3; + invalidDataInfo = new InvalidData(type); + invalidDataInfo.setInvalidValues(valueList.subList(2, 3)); + invalidDataInfo.setInvalidRange(valueList.subList(0, 2)); + } else { + dbgLog.severe("unkown missing-value combination(3 values)"); + } + + } else if (valueList.size() == 2){ + if (codeList.get(0).equals("8") && codeList.get(1).equals("8")){ + type = 2; + invalidDataInfo = new InvalidData(type); + invalidDataInfo.setInvalidValues(valueList); + + } else if (codeList.get(0).equals("9")){ + type = -2; + invalidDataInfo = new InvalidData(type); + invalidDataInfo.setInvalidRange(valueList.subList(0, 2)); + + } else if (codeList.get(0).equals("A")){ + type = -2; + invalidDataInfo = new InvalidData(type); + invalidDataInfo.setInvalidRange(valueList.subList(0, 2)); + } else if (codeList.get(0).equals("B")){ + type = -2; + invalidDataInfo = new InvalidData(type); + invalidDataInfo.setInvalidRange(valueList.subList(0, 2)); + + } else { + dbgLog.severe("unknown missing value combination(2 values)"); + } + } else if (valueList.size() == 1){ + if (codeList.get(0).equals("8")){ + type = 1; + invalidDataInfo = new InvalidData(type); + invalidDataInfo.setInvalidValues(valueList); + } else { + dbgLog.severe("unknown missing value combination(2 values)"); + } + } + invalidDataTable.put(variable, invalidDataInfo); + } + + dbgLog.fine("invalidDataTable="+invalidDataTable); + + + Set>> msvl = missingValueTable.entrySet(); + for (Iterator>> it = msvl.iterator(); it.hasNext();){ + Map.Entry> et = it.next(); + + String variable = et.getKey(); + List valueList = et.getValue(); + + List codeList = missingValueCodeTable.get(variable); + + dbgLog.finer("var="+variable+"\tvalue="+valueList+"\t code"+ codeList); + List temp = new ArrayList(); + for (int j=0; j 0){ + significand = base30StringNoSign.substring(0, plusIndex); + exponent = Long.valueOf( base30StringNoSign.substring(plusIndex+1), oldBase ); + + } else if (minusIndex > 0){ + significand = base30StringNoSign.substring(0, minusIndex); + exponent = -1 * Long.valueOf( base30StringNoSign.substring(minusIndex+1), oldBase ); + + } else { + significand = new String(base30StringNoSign); + } + + + // "move" decimal point; for each shift right, subtract one from exponent; end result is a string with no decimal + int decimalIndex = significand.indexOf("."); + if (decimalIndex != -1) { + exponent -= (significand.length() - (decimalIndex + 1) ); + significand = significand.substring(0, decimalIndex) + significand.substring( decimalIndex + 1 ); + } + + // TODO: Verify that the MathContext/Rounding methods are OK: + // -- L.A. 
4.0 beta + MathContext mc = new MathContext(15,RoundingMode.HALF_UP); + long base10Significand = Long.parseLong(significand, oldBase); + BigDecimal base10value = new BigDecimal( String.valueOf(base10Significand), mc ); + BigDecimal exponentialComponent = new BigDecimal("1", mc); + + for (int g=0; g < Math.abs(exponent); g++) { + exponentialComponent = exponentialComponent.multiply(new BigDecimal("30", mc)); + } + + if (exponent >= 0) { + base10value = base10value.multiply(exponentialComponent, mc); + } else { + base10value = base10value.divide(exponentialComponent, mc); + } + + // negative sign if applicable + if (isNegativeNumber){ + base10value = base10value.multiply(new BigDecimal("-1", mc)); + } + + return base10value.doubleValue(); + } + + void assignValueLabels(Map> valueLabelTable) { + // Let's go through all the categorical value label mappings and + // assign them to the correct variables: + + for (int i = 0; i < dataTable.getVarQuantity().intValue(); i++) { + + String varName = dataTable.getDataVariables().get(i).getName(); + + Map valueLabelPairs = valueLabelTable.get(varName); + if (valueLabelPairs != null && !valueLabelPairs.isEmpty()) { + for (String value : valueLabelPairs.keySet()) { + + VariableCategory cat = new VariableCategory(); + cat.setValue(value); + cat.setLabel(valueLabelPairs.get(value)); + + /* cross-link the variable and category to each other: */ + cat.setDataVariable(dataTable.getDataVariables().get(i)); + dataTable.getDataVariables().get(i).getCategories().add(cat); + } + } + } + } + + private void print2Darray(Object[][] datatable, String title){ + dbgLog.fine(title); + for (int i=0; i< datatable.length; i++){ + dbgLog.fine(StringUtils.join(datatable[i], "|")); + } + } + + +} + diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/por/PORFileReaderSpi.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/por/PORFileReaderSpi.java new file mode 100644 index 00000000000..4e0175c8183 --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/por/PORFileReaderSpi.java @@ -0,0 +1,489 @@ +/* + Copyright (C) 2005-2014, by the President and Fellows of Harvard College. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Dataverse Network - A web application to share, preserve and analyze research data. + Developed at the Institute for Quantitative Social Science, Harvard University. + Version 3.0. +*/ +package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.por; + +import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader; +import edu.harvard.iq.dataverse.ingest.tabulardata.spi.TabularDataFileReaderSpi; + +import java.io.*; +import java.nio.*; +import java.nio.channels.*; +import java.util.logging.*; + +import javax.imageio.IIOException; +import java.util.*; + +import org.apache.commons.codec.binary.Hex; + + +/** + * Service Provider registration class for the SPSS/POR ingest plugin. + * Based on the code originally developed by Akio Sone, HMDC/ODUM + * for v.2 of the DVN. 
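// [Editor's aside -- illustrative sketch, not part of this patch] A condensed
// version of the base-30 decoding that base30Tobase10Conversion() in
// PORFileReader above implements: POR stores numbers in base 30, with an
// optional '+'/'-' exponent suffix (itself base 30) and an optional decimal
// point inside the significand. Method name and examples are illustrative.
import java.math.BigDecimal;
import java.math.MathContext;
import java.math.RoundingMode;

// assumes a well-formed POR numeric field, e.g. "1K", "-1.F", "2+3"
static double base30ToDouble(String raw) {
    MathContext mc = new MathContext(15, RoundingMode.HALF_UP);
    boolean negative = raw.startsWith("-");
    String s = negative ? raw.substring(1) : raw;

    long exponent = 0;
    int expIdx = Math.max(s.indexOf('+'), s.indexOf('-'));
    if (expIdx > 0) {
        exponent = Long.parseLong(s.substring(expIdx + 1), 30);
        if (s.charAt(expIdx) == '-') {
            exponent = -exponent;
        }
        s = s.substring(0, expIdx);
    }

    // "move" the decimal point right; each shifted digit costs one power of 30
    int dot = s.indexOf('.');
    if (dot >= 0) {
        exponent -= (s.length() - dot - 1);
        s = s.substring(0, dot) + s.substring(dot + 1);
    }

    BigDecimal value = new BigDecimal(Long.parseLong(s, 30), mc);
    BigDecimal scale = BigDecimal.ONE;
    for (long i = 0; i < Math.abs(exponent); i++) {
        scale = scale.multiply(new BigDecimal(30, mc), mc);
    }
    value = (exponent >= 0) ? value.multiply(scale, mc) : value.divide(scale, mc);
    return negative ? -value.doubleValue() : value.doubleValue();
}

// base30ToDouble("1K")   == 50.0   (K is 20 in base 30; 30 + 20)
// base30ToDouble("-1.F") == -1.5   (F is 15; 15/30 = 0.5)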
+ * + * @author Leonid Andreev + * original + * @author Akio Sone + */ +public class PORFileReaderSpi extends TabularDataFileReaderSpi{ + + private static Logger dbgLog = Logger.getLogger( + PORFileReaderSpi.class.getPackage().getName()); + + private static int POR_HEADER_SIZE = 500; + public static int POR_MARK_POSITION_DEFAULT = 461; + public static String POR_MARK = "SPSSPORT"; + + private boolean windowsNewLine = true; + + private static String[] formatNames = {"por", "POR"}; + private static String[] extensions = {"por"}; + private static String[] mimeType = {"application/x-spss-por"}; + + public PORFileReaderSpi() { + super("HU-IQSS-DataVerse-project", + "4.0", + formatNames, + extensions, + mimeType, + "edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.por.PORFileReaderSpi"); + dbgLog.fine("PORFileReaderSpi is called"); + } + + @Override + public boolean canDecodeInput(Object source) throws IOException { + if (!(source instanceof BufferedInputStream)) { + return false; + } + if (source == null){ + throw new IllegalArgumentException("source == null!"); + } + BufferedInputStream stream = (BufferedInputStream)source; + dbgLog.fine("applying the por test\n"); + + byte[] b = new byte[POR_HEADER_SIZE]; + + if (stream.markSupported()){ + stream.mark(0); + } + + int nbytes = stream.read(b, 0, POR_HEADER_SIZE); + + //printHexDump(b, "hex dump of the byte-array"); + + if (nbytes == 0){ + throw new IOException(); + } else if ( nbytes < 491) { + // size test + dbgLog.fine("this file is NOT spss-por type"); + return false; + } + + if (stream.markSupported()){ + stream.reset(); + } + + boolean DEBUG = false; + + //windows [0D0A]=> [1310] = [CR/LF] + //unix [0A] => [10] + //mac [0D] => [13] + // 3char [0D0D0A]=> [131310] spss for windows rel 15 + // expected results + // unix case: [0A] : [80], [161], [242], [323], [404], [485] + // windows case: [0D0A] : [81], [163], [245], [327], [409], [491] + // : [0D0D0A] : [82], [165], [248], [331], [414], [495] + + // convert b into a ByteBuffer + + ByteBuffer buff = ByteBuffer.wrap(b); + byte[] nlch = new byte[36]; + int pos1; + int pos2; + int pos3; + int ucase = 0; + int wcase = 0; + int mcase = 0; + int three = 0; + int nolines = 6; + int nocols = 80; + for (int i = 0; i < nolines; ++i) { + int baseBias = nocols * (i + 1); + // 1-char case + pos1 = baseBias + i; + buff.position(pos1); + dbgLog.finer("\tposition(1)=" + buff.position()); + int j = 6 * i; + nlch[j] = buff.get(); + + if (nlch[j] == 10) { + ucase++; + } else if (nlch[j] == 13) { + mcase++; + } + + // 2-char case + pos2 = baseBias + 2 * i; + buff.position(pos2); + dbgLog.finer("\tposition(2)=" + buff.position()); + + nlch[j + 1] = buff.get(); + nlch[j + 2] = buff.get(); + + // 3-char case + pos3 = baseBias + 3 * i; + buff.position(pos3); + dbgLog.finer("\tposition(3)=" + buff.position()); + + nlch[j + 3] = buff.get(); + nlch[j + 4] = buff.get(); + nlch[j + 5] = buff.get(); + + dbgLog.finer(i + "-th iteration position =" + nlch[j] + "\t" + nlch[j + 1] + "\t" + nlch[j + 2]); + dbgLog.finer(i + "-th iteration position =" + nlch[j + 3] + "\t" + nlch[j + 4] + "\t" + nlch[j + 5]); + + if ((nlch[j + 3] == 13) && (nlch[j + 4] == 13) && (nlch[j + 5] == 10)) { + three++; + } else if ((nlch[j + 1] == 13) && (nlch[j + 2] == 10)) { + wcase++; + } + + buff.rewind(); + } + if (three == nolines) { + dbgLog.fine("0D0D0A case"); + windowsNewLine = false; + } else if ((ucase == nolines) && (wcase < nolines)) { + dbgLog.fine("0A case"); + windowsNewLine = false; + } else if ((ucase < nolines) && (wcase == 
nolines)) { + dbgLog.fine("0D0A case"); + } else if ((mcase == nolines) && (wcase < nolines)) { + dbgLog.fine("0D case"); + windowsNewLine = false; + } + + + buff.rewind(); + int PORmarkPosition = POR_MARK_POSITION_DEFAULT; + if (windowsNewLine) { + PORmarkPosition = PORmarkPosition + 5; + } else if (three == nolines) { + PORmarkPosition = PORmarkPosition + 10; + } + + byte[] pormark = new byte[8]; + buff.position(PORmarkPosition); + buff.get(pormark, 0, 8); + String pormarks = new String(pormark); + + dbgLog.fine("pormark[hex: 53 50 53 53 50 4F 52 54 == SPSSPORT] =>" + + new String(Hex.encodeHex(pormark)) + "<-"); + + if (pormarks.equals(POR_MARK)) { + dbgLog.fine("this file is spss-por type"); + return true; + } else { + dbgLog.fine("this file is NOT spss-por type"); + } + return false; + } + + + + @Override + public boolean canDecodeInput(BufferedInputStream stream) throws IOException { + if (stream == null){ + throw new IllegalArgumentException("file == null!"); + } + + dbgLog.fine("applying the por test\n"); + + byte[] b = new byte[POR_HEADER_SIZE]; + + if (stream.markSupported()){ + stream.mark(0); + } + + int nbytes = stream.read(b, 0, POR_HEADER_SIZE); + + //printHexDump(b, "hex dump of the byte-array"); + + if (nbytes == 0){ + throw new IOException(); + } else if ( nbytes < 491) { + // size test + dbgLog.fine("this file is NOT spss-por type"); + return false; + } + + if (stream.markSupported()){ + stream.reset(); + } + + boolean DEBUG = false; + + //windows [0D0A]=> [1310] = [CR/LF] + //unix [0A] => [10] + //mac [0D] => [13] + // 3char [0D0D0A]=> [131310] spss for windows rel 15 + // expected results + // unix case: [0A] : [80], [161], [242], [323], [404], [485] + // windows case: [0D0A] : [81], [163], [245], [327], [409], [491] + // : [0D0D0A] : [82], [165], [248], [331], [414], [495] + + // convert b into a ByteBuffer + + ByteBuffer buff = ByteBuffer.wrap(b); + byte[] nlch = new byte[36]; + int pos1; + int pos2; + int pos3; + int ucase = 0; + int wcase = 0; + int mcase = 0; + int three = 0; + int nolines = 6; + int nocols = 80; + for (int i = 0; i < nolines; ++i) { + int baseBias = nocols * (i + 1); + // 1-char case + pos1 = baseBias + i; + buff.position(pos1); + dbgLog.finer("\tposition(1)=" + buff.position()); + int j = 6 * i; + nlch[j] = buff.get(); + + if (nlch[j] == 10) { + ucase++; + } else if (nlch[j] == 13) { + mcase++; + } + + // 2-char case + pos2 = baseBias + 2 * i; + buff.position(pos2); + dbgLog.finer("\tposition(2)=" + buff.position()); + + nlch[j + 1] = buff.get(); + nlch[j + 2] = buff.get(); + + // 3-char case + pos3 = baseBias + 3 * i; + buff.position(pos3); + dbgLog.finer("\tposition(3)=" + buff.position()); + + nlch[j + 3] = buff.get(); + nlch[j + 4] = buff.get(); + nlch[j + 5] = buff.get(); + + dbgLog.finer(i + "-th iteration position =" + nlch[j] + "\t" + nlch[j + 1] + "\t" + nlch[j + 2]); + dbgLog.finer(i + "-th iteration position =" + nlch[j + 3] + "\t" + nlch[j + 4] + "\t" + nlch[j + 5]); + + if ((nlch[j + 3] == 13) && (nlch[j + 4] == 13) && (nlch[j + 5] == 10)) { + three++; + } else if ((nlch[j + 1] == 13) && (nlch[j + 2] == 10)) { + wcase++; + } + + buff.rewind(); + } + if (three == nolines) { + dbgLog.fine("0D0D0A case"); + windowsNewLine = false; + } else if ((ucase == nolines) && (wcase < nolines)) { + dbgLog.fine("0A case"); + windowsNewLine = false; + } else if ((ucase < nolines) && (wcase == nolines)) { + dbgLog.fine("0D0A case"); + } else if ((mcase == nolines) && (wcase < nolines)) { + dbgLog.fine("0D case"); + windowsNewLine = false; + } + + + 
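// [Editor's aside -- illustrative sketch, not part of this patch] This
// newline-detection block is repeated in PORFileReader.decodeHeader() and in
// each canDecodeInput() overload; a shared helper could classify the
// terminator once (names below are hypothetical). The SPSSPORT mark offset
// then follows directly: 461 for LF/CR files, 461+5 for CRLF, and 461+10 for
// the CR CR LF files written by SPSS 15 for Windows.
enum PorLineTerminator { LF, CRLF, CR, CRCRLF }

static PorLineTerminator classifyTerminator(int lfCount, int crlfCount, int crCount,
                                            int crcrlfCount, int linesChecked) {
    if (crcrlfCount == linesChecked) return PorLineTerminator.CRCRLF;
    if (crlfCount   == linesChecked) return PorLineTerminator.CRLF;
    if (crCount     == linesChecked) return PorLineTerminator.CR;
    return PorLineTerminator.LF;
}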
buff.rewind(); + int PORmarkPosition = POR_MARK_POSITION_DEFAULT; + if (windowsNewLine) { + PORmarkPosition = PORmarkPosition + 5; + } else if (three == nolines) { + PORmarkPosition = PORmarkPosition + 10; + } + + byte[] pormark = new byte[8]; + buff.position(PORmarkPosition); + buff.get(pormark, 0, 8); + String pormarks = new String(pormark); + + //dbgLog.fine("pormark =>" + pormarks + "<-"); + dbgLog.fine("pormark[hex: 53 50 53 53 50 4F 52 54 == SPSSPORT] =>" + + new String(Hex.encodeHex(pormark)) + "<-"); + + if (pormarks.equals(POR_MARK)) { + dbgLog.fine("this file is spss-por type"); + return true; + } else { + dbgLog.fine("this file is NOT spss-por type"); + } + return false; + } + + + @Override + public boolean canDecodeInput(File file) throws IOException { + if (file ==null){ + throw new IllegalArgumentException("file == null!"); + } + if (!file.canRead()){ + throw new IOException("cannot read the input file"); + } + + // set-up a FileChannel instance for a given file object + FileChannel srcChannel = new FileInputStream(file).getChannel(); + + // create a read-only MappedByteBuffer + MappedByteBuffer buff = srcChannel.map(FileChannel.MapMode.READ_ONLY, 0, POR_HEADER_SIZE); + + //printHexDump(buff, "hex dump of the byte-buffer"); + + buff.rewind(); + + boolean DEBUG = false; + + + dbgLog.fine("applying the spss-por test\n"); + + // size test + if (buff.capacity() < 491) { + dbgLog.fine("this file is NOT spss-por type"); + return false; + } + + //windows [0D0A]=> [1310] = [CR/LF] + //unix [0A] => [10] + //mac [0D] => [13] + // 3char [0D0D0A]=> [131310] spss for windows rel 15 + // expected results + // unix case: [0A] : [80], [161], [242], [323], [404], [485] + // windows case: [0D0A] : [81], [163], [245], [327], [409], [491] + // : [0D0D0A] : [82], [165], [248], [331], [414], [495] + + buff.rewind(); + byte[] nlch = new byte[36]; + int pos1; + int pos2; + int pos3; + int ucase = 0; + int wcase = 0; + int mcase = 0; + int three = 0; + int nolines = 6; + int nocols = 80; + for (int i = 0; i < nolines; ++i) { + int baseBias = nocols * (i + 1); + // 1-char case + pos1 = baseBias + i; + buff.position(pos1); + dbgLog.finer("\tposition(1)=" + buff.position()); + int j = 6 * i; + nlch[j] = buff.get(); + + if (nlch[j] == 10) { + ucase++; + } else if (nlch[j] == 13) { + mcase++; + } + + // 2-char case + pos2 = baseBias + 2 * i; + buff.position(pos2); + dbgLog.finer("\tposition(2)=" + buff.position()); + + nlch[j + 1] = buff.get(); + nlch[j + 2] = buff.get(); + + // 3-char case + pos3 = baseBias + 3 * i; + buff.position(pos3); + dbgLog.finer("\tposition(3)=" + buff.position()); + + nlch[j + 3] = buff.get(); + nlch[j + 4] = buff.get(); + nlch[j + 5] = buff.get(); + + dbgLog.finer(i + "-th iteration position =" + nlch[j] + "\t" + nlch[j + 1] + "\t" + nlch[j + 2]); + dbgLog.finer(i + "-th iteration position =" + nlch[j + 3] + "\t" + nlch[j + 4] + "\t" + nlch[j + 5]); + + if ((nlch[j + 3] == 13) && (nlch[j + 4] == 13) && (nlch[j + 5] == 10)) { + three++; + } else if ((nlch[j + 1] == 13) && (nlch[j + 2] == 10)) { + wcase++; + } + + buff.rewind(); + } + if (three == nolines) { + dbgLog.fine("0D0D0A case"); + windowsNewLine = false; + } else if ((ucase == nolines) && (wcase < nolines)) { + dbgLog.fine("0A case"); + windowsNewLine = false; + } else if ((ucase < nolines) && (wcase == nolines)) { + dbgLog.fine("0D0A case"); + } else if ((mcase == nolines) && (wcase < nolines)) { + dbgLog.fine("0D case"); + windowsNewLine = false; + } + + + buff.rewind(); + int PORmarkPosition = 
POR_MARK_POSITION_DEFAULT; + if (windowsNewLine) { + PORmarkPosition = PORmarkPosition + 5; + } else if (three == nolines) { + PORmarkPosition = PORmarkPosition + 10; + } + + byte[] pormark = new byte[8]; + buff.position(PORmarkPosition); + buff.get(pormark, 0, 8); + String pormarks = new String(pormark); + + dbgLog.fine("pormark =>" + pormarks + "<-"); + + + if (pormarks.equals(POR_MARK)) { + dbgLog.fine("this file is spss-por type"); + return true; + } else { + dbgLog.fine("this file is NOT spss-por type"); + } + return false; + } + + public String getDescription(Locale locale) { + return "HU-IQSS-DataVerse-project SPSS/POR (\"portable\") File Ingest plugin"; + } + + @Override + public TabularDataFileReader createReaderInstance(Object ext) throws IIOException { + return new PORFileReader(this); + } +} diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/sav/SAVFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/sav/SAVFileReader.java new file mode 100644 index 00000000000..b61b0a1536f --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/sav/SAVFileReader.java @@ -0,0 +1,3536 @@ +/* + Copyright (C) 2005-2012, by the President and Fellows of Harvard College. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Dataverse Network - A web application to share, preserve and analyze research data. + Developed at the Institute for Quantitative Social Science, Harvard University. + Version 3.0. +*/ +package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.sav; + +import java.io.*; +import java.nio.*; +import java.util.logging.*; + +import java.util.*; +import java.util.regex.*; +import java.text.*; + + +import org.apache.commons.lang.*; +import org.apache.commons.codec.binary.Hex; +import javax.inject.Inject; +import javax.naming.Context; +import javax.naming.InitialContext; +import javax.naming.NamingException; + +import edu.harvard.iq.dataverse.DataTable; +import edu.harvard.iq.dataverse.datavariable.DataVariable; +import edu.harvard.iq.dataverse.datavariable.VariableCategory; +import edu.harvard.iq.dataverse.datavariable.VariableFormatType; +import edu.harvard.iq.dataverse.datavariable.VariableServiceBean; + +import edu.harvard.iq.dataverse.ingest.plugin.spi.*; +import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader; +import edu.harvard.iq.dataverse.ingest.tabulardata.spi.TabularDataFileReaderSpi; +import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest; +import edu.harvard.iq.dataverse.ingest.tabulardata.InvalidData; + + + +/** + * ingest plugin for SPSS SAV file format. + * + * This reader plugin has been fully re-implemented for the DVN 4.0; + * It is still borrows heavily from, and builds on the basis of the + * old implementation by Akio Sone, that was in use in the versions + * 2-3 of the DVN. 
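// [Editor's aside -- illustrative sketch, not part of this patch] A
// hypothetical caller of the PORFileReaderSpi defined above would probe the
// stream and then obtain a reader. Only methods visible in this patch are
// used; the getTabDelimitedFile() accessor on TabularDataIngest is assumed.
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest;
import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.por.PORFileReader;
import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.por.PORFileReaderSpi;

public static void main(String[] args) throws Exception {
    BufferedInputStream in = new BufferedInputStream(new FileInputStream(args[0]));
    PORFileReaderSpi spi = new PORFileReaderSpi();
    if (spi.canDecodeInput(in)) {              // the probe mark()s and reset()s the stream
        PORFileReader reader = (PORFileReader) spi.createReaderInstance(null);
        TabularDataIngest ingest = reader.read(in, null); // null: no external raw data file
        System.out.println("ingested: " + ingest.getTabDelimitedFile()); // accessor assumed
    }
    in.close();
}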
+ * + * @author Akio Sone at UNC-Odum + * @author Leonid Andreev + */ + +public class SAVFileReader extends TabularDataFileReader{ + @Inject + VariableServiceBean varService; + + // static fields ---------------------------------------------------------// + private static String[] FORMAT_NAMES = {"sav", "SAV"}; + private static String[] EXTENSIONS = {"sav"}; + private static String[] MIME_TYPE = {"application/x-spss-sav"}; + + private static final int LENGTH_SAV_INT_BLOCK = 4; + // note: OBS block is either double or String, not Integer + private static final int LENGTH_SAV_OBS_BLOCK = 8; + + private static final int SAV_MAGIC_NUMBER_LENGTH = LENGTH_SAV_INT_BLOCK; + + private static String SAV_FILE_SIGNATURE = "$FL2"; + + + + // Record Type 1 fields + private static final int LENGTH_RECORDTYPE1 = 172; + + private static final int LENGTH_SPSS_PRODUCT_INFO = 60; + + private static final int FILE_LAYOUT_CONSTANT = 2; + + private static final int LENGTH_FILE_LAYOUT_CODE = LENGTH_SAV_INT_BLOCK; + + private static final int LENGTH_NUMBER_OF_OBS_UNITS_PER_CASE = LENGTH_SAV_INT_BLOCK; + + private static final int LENGTH_COMPRESSION_SWITCH = LENGTH_SAV_INT_BLOCK; + + private static final int LENGTH_CASE_WEIGHT_VARIABLE_INDEX = LENGTH_SAV_INT_BLOCK; + + private static final int LENGTH_NUMBER_OF_CASES = LENGTH_SAV_INT_BLOCK; + + private static final int LENGTH_COMPRESSION_BIAS = LENGTH_SAV_OBS_BLOCK; + + private static final int LENGTH_FILE_CREATION_INFO = 84; + + private static final int length_file_creation_date = 9; + private static final int length_file_creation_time = 8; + private static final int length_file_creation_label= 64; + private static final int length_file_creation_padding = 3; + + // Recorde Type 2 + + private static final int LENGTH_RECORDTYPE2_FIXED = 32; + private static final int LENGTH_RECORD_TYPE2_CODE = 4; + private static final int LENGTH_TYPE_CODE = 4; + private static final int LENGTH_LABEL_FOLLOWS = 4; + private static final int LENGTH_MISS_VALUE_FORMAT_CODE= 4; + private static final int LENGTH_PRINT_FORMAT_CODE = 4;; + private static final int LENGTH_WRITE_FORMAT_CODE = 4; + private static final int LENGTH_VARIABLE_NAME = 8; + private static final int LENGTH_VARIABLE_LABEL= 4; + + private static final int LENGTH_MISS_VAL_OBS_CODE = LENGTH_SAV_OBS_BLOCK; + + // Record Type 3/4 + private static final int LENGTH_RECORDTYPE3_HEADER_CODE = 4; + private static final int LENGTH_RECORD_TYPE3_CODE = 4; + private static final int LENGTH_RT3_HOW_MANY_LABELS = 4; + private static final int LENGTH_RT3_VALUE = LENGTH_SAV_OBS_BLOCK; + private static final int LENGTH_RT3_LABEL_LENGTH =1; + + private static final int LENGTH_RECORD_TYPE4_CODE = 4; + private static final int LENGTH_RT4_HOW_MANY_VARIABLES = 4; + private static final int LENGTH_RT4_VARIABLE_INDEX = 4; + + // Record Type 6 + private static final int LENGTH_RECORD_TYPE6_CODE = 4; + private static final int LENGTH_RT6_HOW_MANY_LINES = 4; + private static final int LENGTH_RT6_DOCUMENT_LINE = 80; + + // Record Type 7 + private static final int LENGTH_RECORD_TYPE7_CODE = 4; + private static final int LENGTH_RT7_SUB_TYPE_CODE = 4; + + // Record Type 999 + private static final int LENGTH_RECORD_TYPE999_CODE = 4; + private static final int LENGTH_RT999_FILLER = 4; + + + private static final List RecordType7SubType4Fields= new ArrayList(); + private static final Set validMissingValueCodeSet = new HashSet(); + private static final Map missingValueCodeUnits = new HashMap(); + + private static double SYSMIS_LITTLE =0xFFFFFFFFFFFFEFFFL; + 
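// [Editor's aside -- illustrative sketch, not part of this patch] SYSMIS_LITTLE
// above and SYSMIS_BIG just below correspond to the little- and big-endian
// renderings of the SPSS system-missing value, i.e. the most negative IEEE
// double (bit pattern 0xFFEFFFFFFFFFFFFF == -Double.MAX_VALUE). An
// endianness-aware check over a raw 8-byte OBS block could look like this
// (hypothetical helper):
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

static boolean isSysmis(byte[] obsBlock, boolean isLittleEndian) {
    ByteBuffer bb = ByteBuffer.wrap(obsBlock)
            .order(isLittleEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN);
    return bb.getDouble() == -Double.MAX_VALUE;
}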
private static double SYSMIS_BIG =0xFFEFFFFFFFFFFFFFL; + + private static Calendar GCO = new GregorianCalendar(); + + static { + + // initialize validMissingValueCodeSet + validMissingValueCodeSet.add(3); + validMissingValueCodeSet.add(2); + validMissingValueCodeSet.add(1); + validMissingValueCodeSet.add(0); + validMissingValueCodeSet.add(-2); + validMissingValueCodeSet.add(-3); + + // initialize missingValueCodeUnits + + missingValueCodeUnits.put(1, 1); + missingValueCodeUnits.put(2, 2); + missingValueCodeUnits.put(3, 3); + missingValueCodeUnits.put(-2,2); + missingValueCodeUnits.put(-3, 3); + missingValueCodeUnits.put(0, 0); + + RecordType7SubType4Fields.add("SYSMIS"); + RecordType7SubType4Fields.add("HIGHEST"); + RecordType7SubType4Fields.add("LOWEST"); + + // set the origin of GCO to 1582-10-15 + GCO.set(1, 1582);// year + GCO.set(2, 9); // month + GCO.set(5, 15);// day of month + GCO.set(9, 0);// AM(0) or PM(1) + GCO.set(10, 0);// hh + GCO.set(12, 0);// mm + GCO.set(13, 0);// ss + GCO.set(14, 0); // SS millisecond + GCO.set(15, 0);// z + } + + private static final long SPSS_DATE_BIAS = 60*60*24*1000; + + private static final long SPSS_DATE_OFFSET = SPSS_DATE_BIAS + Math.abs(GCO.getTimeInMillis()); + + + // instance fields -------------------------------------------------------// + private static String unfVersionNumber = "6"; + + // instance fields -------------------------------------------------------// + + private static Logger dbgLog = Logger.getLogger(SAVFileReader.class.getPackage().getName()); + + + TabularDataIngest ingesteddata = new TabularDataIngest(); + private DataTable dataTable = new DataTable(); + + Map shortToLongVariableNameTable = new LinkedHashMap(); + Map formatCategoryTable = new LinkedHashMap(); + + + + private boolean isLittleEndian = false; + private boolean isDataSectionCompressed = true; + + private Map OBSIndexToVariableName = + new LinkedHashMap(); + + private int OBSUnitsPerCase; + + private List variableTypelList= new ArrayList(); + private List OBSwiseTypelList= new ArrayList(); + + Map printFormatTable = new LinkedHashMap(); + + + Set obsNonVariableBlockSet = new LinkedHashSet(); + + + Map valueVariableMappingTable = new LinkedHashMap(); + + Map extendedVariablesSizeTable = new LinkedHashMap(); + + + List variableNameList = new ArrayList(); + + + Map invalidDataTable = new LinkedHashMap(); // this variable used in 2 methods; only one uses it to set the smd value -- ?? + + NumberFormat doubleNumberFormatter = new DecimalFormat(); + + Set decimalVariableSet = new HashSet(); + + String[] variableFormatTypeList= null; + + List formatDecimalPointPositionList= new ArrayList(); + + + int caseWeightVariableOBSIndex = 0; + + + // date/time data formats + + private SimpleDateFormat sdf_ymd = new SimpleDateFormat("yyyy-MM-dd"); + private SimpleDateFormat sdf_ymdhms = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + private SimpleDateFormat sdf_dhms = new SimpleDateFormat("DDD HH:mm:ss"); + private SimpleDateFormat sdf_hms = new SimpleDateFormat("HH:mm:ss"); + + + Map OBSTypeHexValue = new LinkedHashMap(); + + + /* We should be defaulting to ISO-Latin, NOT US-ASCII! -- L.A. */ + private String defaultCharSet = "ISO-8859-1"; + private int spssVersionNumber = 0; + + + /** + * The String that represents the numeric missing value + * in the final tab-delimited data file. 
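// [Editor's aside -- illustrative sketch, not part of this patch] SPSS stores
// date values as seconds since 1582-10-14 00:00 GMT (the start of the
// Gregorian calendar); SPSS_DATE_OFFSET earlier in this class is that origin
// expressed as milliseconds before the Unix epoch, so the conversion applied
// when the data section is decoded is essentially the following (method name
// is illustrative):
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimeZone;

static String spssDateToIso(long spssSeconds, long spssDateOffsetMillis) {
    SimpleDateFormat ymd = new SimpleDateFormat("yyyy-MM-dd");
    ymd.setTimeZone(TimeZone.getTimeZone("GMT"));
    // shift from the SPSS origin back to the Unix epoch, then format
    return ymd.format(new Date(spssSeconds * 1000L - spssDateOffsetMillis));
}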
+ */ + private String MissingValueForTextDataFileNumeric = ""; + + + public String getMissingValueForTextDataFileNumeric() { + return MissingValueForTextDataFileNumeric; + } + + + public void setMissingValueForTextDataFileNumeric(String MissingValueToken) { + this.MissingValueForTextDataFileNumeric = MissingValueToken; + } + + + String MissingValueForTextDataFileString = ""; + + + public String getMissingValueForTextDataFileString() { + return MissingValueForTextDataFileString; + } + + + public void setMissingValueForTextDataFileString(String MissingValueToken) { + this.MissingValueForTextDataFileString = MissingValueToken; + } + + + public SAVFileReader(TabularDataFileReaderSpi originator){ + super(originator); + } + + // Methods ---------------------------------------------------------------// + + private void init() throws IOException { + + Context ctx = null; + try { + ctx = new InitialContext(); + varService = (VariableServiceBean) ctx.lookup("java:global/dataverse-4.0/VariableServiceBean"); + } catch (NamingException nex) { + try { + ctx = new InitialContext(); + varService = (VariableServiceBean) ctx.lookup("java:global/dataverse/VariableServiceBean"); + } catch (NamingException nex2) { + if (dbgLog.isLoggable(Level.INFO)) dbgLog.info("Could not look up initial context, or the variable service in JNDI!"); + throw new IOException ("Could not look up initial context, or the variable service in JNDI!"); + } + } + + sdf_ymd.setTimeZone(TimeZone.getTimeZone("GMT")); + sdf_ymdhms.setTimeZone(TimeZone.getTimeZone("GMT")); + sdf_dhms.setTimeZone(TimeZone.getTimeZone("GMT")); + sdf_hms.setTimeZone(TimeZone.getTimeZone("GMT")); + + doubleNumberFormatter.setGroupingUsed(false); + doubleNumberFormatter.setMaximumFractionDigits(340); + + if (getDataLanguageEncoding() != null) { + defaultCharSet = getDataLanguageEncoding(); + } + } + + public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException{ + dbgLog.info("SAVFileReader: read() start"); + + if (dataFile != null) { + throw new IOException ("this plugin does not support external raw data files"); + } + + /* ingest happens here ... */ + + // the following methods are now executed, in this order: + + // decodeHeader -- this method doesn't read any [meta]data and + // doesn't initialize any values; its only purpose is to + // make sure that the file is indeed an SPSS/SAV file. + // + // decodeRecordType1 -- there's always one RT1 record; it is + // always 176 byte long. it contains the very basic metadata + // about the data file. most notably, the number of observations + // and the number of OBS (8 byte values) per observation. + // + // decodeRecordType2 -- there are multiple RT2 records. there's + // one RT2 for every OBS (8 byte value); i.e. one per variable, + // or more per every String variable split into multiple OBS + // segments. this one is a 400 line method, that may benefit + // from being split into smaller methods. + // + // decodeRecordType3and4 -- these sections come in pairs, each + // pair dedicated to one set of variable labels. + // decodeRecordType6, + // + // decodeRecordType7 -- this RT contains some extended + // metadata for the data file. (including the information + // about the extended variables, i.e. variables longer than + // 255 bytes split into 255 byte fragments that are stored + // in the data file as independent variables). 
+ // + // decodeRecordType999 -- this RT does not contain any data; + // its sole function is to indicate that the metadata portion + // of the data file is over and the data section follows. + // + // decodeRecordTypeData -- this method decodes the data section + // of the file. Inside this method, 2 distinct methods are + // called to process compressed or uncompressed data, depending + // on which method is used in this data file. + + + String methodCurrentlyExecuted = null; + + try { + methodCurrentlyExecuted = "decodeHeader"; + dbgLog.fine("***** SAVFileReader: executing method decodeHeader"); + decodeHeader(stream); + + methodCurrentlyExecuted = "decodeRecordType1"; + dbgLog.fine("***** SAVFileReader: executing method decodeRecordType1"); + decodeRecordType1(stream); + + methodCurrentlyExecuted = "decodeRecordType2"; + dbgLog.fine("***** SAVFileReader: executing method decodeRecordType1"); + decodeRecordType2(stream); + + methodCurrentlyExecuted = "decodeRecordType3and4"; + dbgLog.fine("***** SAVFileReader: executing method decodeRecordType3and4"); + decodeRecordType3and4(stream); + + methodCurrentlyExecuted = "decodeRecordType6"; + dbgLog.fine("***** SAVFileReader: executing method decodeRecordType6"); + decodeRecordType6(stream); + + methodCurrentlyExecuted = "decodeRecordType7"; + dbgLog.fine("***** SAVFileReader: executing method decodeRecordType7"); + decodeRecordType7(stream); + + methodCurrentlyExecuted = "decodeRecordType999"; + dbgLog.fine("***** SAVFileReader: executing method decodeRecordType999"); + decodeRecordType999(stream); + + methodCurrentlyExecuted = "decodeRecordTypeData"; + dbgLog.fine("***** SAVFileReader: executing method decodeRecordTypeData"); + decodeRecordTypeData(stream); + + + } catch (IllegalArgumentException e) { + //Throwable cause = e.getCause(); + dbgLog.fine("***** SAVFileReader: ATTENTION: IllegalArgumentException thrown while executing "+methodCurrentlyExecuted); + e.printStackTrace(); + throw new IllegalArgumentException ( "in method "+methodCurrentlyExecuted+": "+e.getMessage() ); + } catch (IOException e) { + dbgLog.fine("***** SAVFileReader: ATTENTION: IOException thrown while executing "+methodCurrentlyExecuted); + e.printStackTrace(); + throw new IOException ( "in method "+methodCurrentlyExecuted+": "+e.getMessage() ); + } + + /* + * Final variable type assignments; + * TODO: (maybe?) + * Instead of doing it here, perhaps all the type assignments need to + * be done on DataVariable objects directly; without relying on + * maps and lists here... -- L.A. 4.0 beta (?) + */ + + for (int indx = 0; indx < variableTypelList.size(); indx++) { + String varName = dataTable.getDataVariables().get(indx).getName(); + int simpleType = 0; + if (variableTypelList.get(indx) != null) { + simpleType = variableTypelList.get(indx).intValue(); + } + + if (simpleType <= 0) { + // We need to make one last type adjustment: + // Dates and Times will be stored as character values in the + // dataverse tab files; even though they are not typed as + // strings at this point: + // TODO: + // Make sure the date/time format is properly preserved! + // (see the setFormatCategory below... but double-check!) + // -- L.A. 
4.0 alpha + String variableFormatType = variableFormatTypeList[indx]; + if (variableFormatType != null + && (variableFormatType.equals("time") + || variableFormatType.equals("date"))) { + ///variableTypeMinimal[indx] = 1; + simpleType = 1; + + String formatCategory = formatCategoryTable.get(varName); + + if (formatCategory != null) { + dataTable.getDataVariables().get(indx).setFormatCategory(formatCategory); + } + } + } + + // OK, we can now assign the types: + + if (simpleType > 0) { + // String: + dataTable.getDataVariables().get(indx).setVariableFormatType(varService.findVariableFormatTypeByName("character")); + dataTable.getDataVariables().get(indx).setVariableIntervalType(varService.findVariableIntervalTypeByName("discrete")); + } else { + // Numeric: + dataTable.getDataVariables().get(indx).setVariableFormatType(varService.findVariableFormatTypeByName("numeric")); + // discrete or continuous? + // "decimal variables" become dataverse data variables of interval type "continuous": + + if (decimalVariableSet.contains(indx)) { + dataTable.getDataVariables().get(indx).setVariableIntervalType(varService.findVariableIntervalTypeByName("continuous")); + } else { + dataTable.getDataVariables().get(indx).setVariableIntervalType(varService.findVariableIntervalTypeByName("discrete")); + } + + } + + // TODO: take care of the SPSS "shortToLongVariableNameTable" + // mapping before returning the ingested data object. -- 4.0 alpha + // (done, below - but verify!) + + if (shortToLongVariableNameTable.containsKey(varName)) { + String longName = shortToLongVariableNameTable.get(varName); + if (longName != null && !longName.equals("")) { + dataTable.getDataVariables().get(indx).setName(longName); + } + } + + } + + ingesteddata.setDataTable(dataTable); + + dbgLog.info("SAVFileReader: read() end"); + return ingesteddata; + } + + void decodeHeader(BufferedInputStream stream) throws IOException { + dbgLog.fine("decodeHeader(): start"); + + if (stream ==null){ + throw new IllegalArgumentException("stream == null!"); + } + // the length of the magic number is 4 (1-byte character * 4) + // its value is expected to be $FL2 + + byte[] b = new byte[SAV_MAGIC_NUMBER_LENGTH]; + + try { + if (stream.markSupported()){ + stream.mark(100); + } + int nbytes = stream.read(b, 0, SAV_MAGIC_NUMBER_LENGTH); + + if (nbytes == 0){ + throw new IOException(); + } + + } catch (IOException ex){ + //ex.printStackTrace(); + throw ex; + } + + //printHexDump(b, "hex dump of the byte-array"); + + String hdr4sav = new String(b); + dbgLog.fine("from string=" + hdr4sav); + + if (hdr4sav.equals(SAV_FILE_SIGNATURE)) { + dbgLog.fine("this file is spss-sav type"); + // initialize version-specific parameter + init(); + + dataTable.setOriginalFileFormat(MIME_TYPE[0]); + + dataTable.setUnf("UNF:6:NOTCALCULATED"); + + + } else { + dbgLog.fine("this file is NOT spss-sav type"); + + throw new IllegalArgumentException("given file is not spss-sav type"); + } + + // TODO: + // Decide what to do with the charset, where should it be stored? + // -- 4.0 alpha + //4.0//smd.getFileInformation().put("charset", defaultCharSet); + dbgLog.fine("***** decodeHeader(): end *****"); + + } + + + void decodeRecordType1(BufferedInputStream stream) throws IOException { + dbgLog.fine("***** decodeRecordType1(): start *****"); + + if (stream ==null){ + throw new IllegalArgumentException("stream == null!"); + } + // how to read each recordType + // 1. set-up the following objects before reading bytes + // a. the working byte array + // b. 
the storage object + // the length of this field: 172bytes = 60 + 4 + 12 + 4 + 8 + 84 + // this field consists of 6 distinct blocks + + byte[] recordType1 = new byte[LENGTH_RECORDTYPE1]; + // int caseWeightVariableOBSIndex = 0; + + try { + int nbytes = stream.read(recordType1, 0, LENGTH_RECORDTYPE1); + + + //printHexDump(recordType1, "recordType1"); + + if (nbytes == 0){ + throw new IOException("reading recordType1: no byte was read"); + } + + // 1.1 60 byte-String that tells the platform/version of SPSS that + // wrote this file + + int offset_start = 0; + int offset_end = LENGTH_SPSS_PRODUCT_INFO; // 60 bytes + + String productInfo = new String(Arrays.copyOfRange(recordType1, offset_start, + offset_end),"US-ASCII"); + + dbgLog.fine("productInfo:\n"+productInfo+"\n"); + + // try to parse out the SPSS version that created this data + // file: + + String spssVersionNumberTag = null; + + String regexpVersionNumber = ".*Release ([0-9]*)"; + Pattern versionTagPattern = Pattern.compile(regexpVersionNumber); + Matcher matcher = versionTagPattern.matcher(productInfo); + if ( matcher.find() ) { + spssVersionNumberTag = matcher.group(1); + dbgLog.fine("SPSS Version Number: "+spssVersionNumberTag); + dataTable.setOriginalFormatVersion(spssVersionNumberTag); + } + + if (spssVersionNumberTag != null && !spssVersionNumberTag.equals("")) { + spssVersionNumber = Integer.valueOf(spssVersionNumberTag).intValue(); + + + /* + * Starting with SPSS version 16, the default encoding is + * UTF-8. + * But we are only going to use it if the user did not explicitly + * specify the encoding on the addfiles page. Then we'd want + * to stick with whatever they entered. + */ + if (spssVersionNumber > 15) { + if (getDataLanguageEncoding() == null) { + defaultCharSet = "UTF-8"; + } + } + } + + // TODO: + // decide what to do with the charset? 
-- 4.0 alpha + //4.0//smd.getFileInformation().put("charset", defaultCharSet); + + // 1.2) 4-byte file-layout-code (byte-order) + + offset_start = offset_end; + offset_end += LENGTH_FILE_LAYOUT_CODE; // 4 byte + + ByteBuffer bb_fileLayout_code = ByteBuffer.wrap( + recordType1, offset_start, LENGTH_FILE_LAYOUT_CODE); + + ByteBuffer byteOderTest = bb_fileLayout_code.duplicate(); + // interprete the 4 byte as int + + int int2test = byteOderTest.getInt(); + + if (int2test == 2 || int2test == 3){ + dbgLog.fine("integer == "+int2test+": the byte-oder of the writer is the same "+ + "as the counterpart of Java: Big Endian"); + } else { + // Because Java's byte-order is always big endian, + // this(!=2) means this sav file was written on a little-endian machine + // non-string, multi-bytes blocks must be byte-reversed + + bb_fileLayout_code.order(ByteOrder.LITTLE_ENDIAN); + + int2test = bb_fileLayout_code.getInt(); + + if (int2test == 2 || int2test == 3){ + dbgLog.fine("The sav file was saved on a little endian machine"); + dbgLog.fine("Reveral of the bytes is necessary to decode "+ + "multi-byte, non-string blocks"); + + isLittleEndian = true; + + } else { + throw new IOException("reading recordType1:unknown file layout code="+int2test); + } + } + + dbgLog.fine("Endian of this platform:"+ByteOrder.nativeOrder().toString()); + + // 1.3 4-byte Number_Of_OBS_Units_Per_Case + // (= how many RT2 records => how many varilables) + + offset_start = offset_end; + offset_end += LENGTH_NUMBER_OF_OBS_UNITS_PER_CASE; // 4 byte + + ByteBuffer bb_OBS_units_per_case = ByteBuffer.wrap( + recordType1, offset_start,LENGTH_NUMBER_OF_OBS_UNITS_PER_CASE); + + if (isLittleEndian){ + bb_OBS_units_per_case.order(ByteOrder.LITTLE_ENDIAN); + } + + + OBSUnitsPerCase = bb_OBS_units_per_case.getInt(); + + dbgLog.fine("RT1: OBSUnitsPerCase="+OBSUnitsPerCase); + + // 1.4 4-byte Compression_Switch + + offset_start = offset_end; + offset_end += LENGTH_COMPRESSION_SWITCH; // 4 byte + + ByteBuffer bb_compression_switch = ByteBuffer.wrap(recordType1, + offset_start, LENGTH_COMPRESSION_SWITCH); + + if (isLittleEndian){ + bb_compression_switch.order(ByteOrder.LITTLE_ENDIAN); + } + + int compression_switch = bb_compression_switch.getInt(); + if ( compression_switch == 0){ + // data section is not compressed + isDataSectionCompressed = false; + dbgLog.fine("data section is not compressed"); + } else { + dbgLog.fine("data section is compressed:"+compression_switch); + } + + // 1.5 4-byte Case-Weight Variable Index + // warning: this variable index starts from 1, not 0 + + offset_start = offset_end; + offset_end += LENGTH_CASE_WEIGHT_VARIABLE_INDEX; // 4 byte + + ByteBuffer bb_Case_Weight_Variable_Index = ByteBuffer.wrap(recordType1, + offset_start, LENGTH_CASE_WEIGHT_VARIABLE_INDEX); + + if (isLittleEndian){ + bb_Case_Weight_Variable_Index.order(ByteOrder.LITTLE_ENDIAN); + } + + caseWeightVariableOBSIndex = bb_Case_Weight_Variable_Index.getInt(); + + /// caseWeightVariableOBSIndex will be used later on to locate + /// the weight variable; so we'll be able to mark the corresponding + /// variables properly. + // TODO: make sure case weight variables are properly handled! + // -- L.A. 
4.0 beta + ///smd.getFileInformation().put("caseWeightVariableOBSIndex", caseWeightVariableOBSIndex); + + // 1.6 4-byte Number of Cases + + offset_start = offset_end; + offset_end += LENGTH_NUMBER_OF_CASES; // 4 byte + + ByteBuffer bb_Number_Of_Cases = ByteBuffer.wrap(recordType1, + offset_start, LENGTH_NUMBER_OF_CASES); + + if (isLittleEndian){ + bb_Number_Of_Cases.order(ByteOrder.LITTLE_ENDIAN); + } + + Long numberOfCases = bb_Number_Of_Cases.getLong(); + + if ( numberOfCases < 0){ + // -1 if numberOfCases is unknown + throw new RuntimeException("number of cases is not recorded in the header"); + } else { + dbgLog.fine("RT1: number of cases is recorded= "+numberOfCases); + dataTable.setCaseQuantity(numberOfCases); + ///caseQnty = numberOfCases; + ///smd.getFileInformation().put("caseQnty", numberOfCases); + } + + // 1.7 8-byte compression-bias [not long but double] + + offset_start = offset_end; + offset_end += LENGTH_COMPRESSION_BIAS; // 8 byte + + ByteBuffer bb_compression_bias = ByteBuffer.wrap( + Arrays.copyOfRange(recordType1, offset_start, + offset_end)); + + if (isLittleEndian){ + bb_compression_bias.order(ByteOrder.LITTLE_ENDIAN); + } + + Double compressionBias = bb_compression_bias.getDouble(); + + // TODO: + // check if this "compression bias" is being used anywhere? + // doesn't seem to be! + // -- 4.0 alpha + if ( compressionBias == 100d){ + // 100 is expected + dbgLog.fine("compressionBias is 100 as expected"); + ///smd.getFileInformation().put("compressionBias", 100); + } else { + dbgLog.fine("compression bias is not 100: "+ compressionBias); + ///smd.getFileInformation().put("compressionBias", compressionBias); + } + + + // 1.8 84-byte File Creation Information (date/time: dd MM yyhh:mm:ss + + // 64-bytelabel) + + offset_start = offset_end; + offset_end += LENGTH_FILE_CREATION_INFO; // 84 bytes + + String fileCreationInfo = getNullStrippedString(new String(Arrays.copyOfRange(recordType1, offset_start, + offset_end),"US-ASCII")); + + dbgLog.fine("fileCreationInfo:\n"+fileCreationInfo+"\n"); + + String fileCreationDate = fileCreationInfo.substring(0,length_file_creation_date); + int dateEnd = length_file_creation_date+length_file_creation_time; + String fileCreationTime = fileCreationInfo.substring(length_file_creation_date, + (dateEnd)); + String fileCreationNote = fileCreationInfo.substring(dateEnd,length_file_creation_label); + + + dbgLog.fine("fileDate="+ fileCreationDate); + dbgLog.fine("fileTime="+ fileCreationTime); + dbgLog.fine("fileNote"+ fileCreationNote); + + // 4.0 - my comments from the DTA reader: + /* All these time/date stamps - I don't think we are using + * them anywhere. -- L.A. 4.0 + */ + /* As for the "varformat schema" - storing this information was + * largely redundant, since we know that all the variables in + * this data table come from a Stata file. -- L.A. 4.0 + */ + ///smd.getFileInformation().put("fileDate", fileCreationDate); + ///smd.getFileInformation().put("fileTime", fileCreationTime); + ///smd.getFileInformation().put("fileNote", fileCreationNote); + ///smd.getFileInformation().put("varFormat_schema", "SPSS"); + + + /// mime type has already been set on the newly created dataTable, + /// earlier. 
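+            // A note on the compression bias read above (illustrative numbers, assuming
+            // the usual bias of 100): in a compressed data section, small numeric values
+            // are stored as single control bytes and decoded as (byteCode - bias), so a
+            // control byte of 0x6C (= 108) decodes to 8, and the compressible range
+            // 1..251 covers the values -99..151. decodeRecordTypeDataCompressed() below
+            // hard-codes the 100; a file declaring a different bias would presumably
+            // not decode correctly.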
+ //smd.getFileInformation().put("mimeType", MIME_TYPE[0]); + //smd.getFileInformation().put("fileFormat", MIME_TYPE[0]); + + ///smd.setValueLabelMappingTable(valueVariableMappingTable); + + + } catch (IOException ex) { + throw ex; + } + + dbgLog.fine("decodeRecordType1(): end"); + } + + + void decodeRecordType2(BufferedInputStream stream) throws IOException { + dbgLog.fine("decodeRecordType2(): start"); + if (stream ==null){ + throw new IllegalArgumentException("stream == null!"); + } + + Map printFormatNameTable = new LinkedHashMap(); + Map variableLabelMap = new LinkedHashMap(); + Map> missingValueTable = new LinkedHashMap>(); + List printFormatList = new ArrayList(); + + String caseWeightVariableName = null; + int caseWeightVariableIndex = 0; + + + boolean lastVariableIsExtendable = false; + boolean extendedVariableMode = false; + boolean obs255 = false; + + String lastVariableName = null; + String lastExtendedVariable = null; + + + // this field repeats as many as the number of variables in + // this sav file + + // (note that the above statement is not technically correct, this + // record repeats not just for every variable in the file, but for + // every OBS (8 byte unit); i.e., if a string is split into multiple + // OBS units, each one will have its own RT2 record -- L.A.). + + // Each field constists of a fixed (32-byte) segment and + // then a few variable segments: + // if the variable has a label (3rd INT4 set to 1), then there's 4 more + // bytes specifying the length of the label, and then that many bytes + // holding the label itself (no more than 256). + // Then if there are optional missing value units (4th INT4 set to 1) + // there will be 3 more OBS units attached = 24 extra bytes. + + int variableCounter = 0; + int obsSeqNumber = 0; + + int j; + + dbgLog.fine("RT2: Reading "+OBSUnitsPerCase+" OBS units."); + + for (j=0; j 0: String;-1 continue )="+recordType2FixedPart1[1]); + + //OBSwiseTypelList.add(recordType2FixedPart1[1]); + + int HowManyRt2Units=1; + + + if (recordType2FixedPart1[1] == -1) { + dbgLog.fine("this RT2 is an 8 bit continuation chunk of an earlier string variable"); + if ( obs255 ) { + if ( obsSeqNumber < 30 ) { + OBSwiseTypelList.add(recordType2FixedPart1[1]); + obsSeqNumber++; + } else { + OBSwiseTypelList.add(-2); + obs255 = false; + obsSeqNumber = 0; + } + } else { + OBSwiseTypelList.add(recordType2FixedPart1[1]); + } + + obsNonVariableBlockSet.add(j); + continue; + } else if (recordType2FixedPart1[1] == 0){ + // This is a numeric variable + extendedVariableMode = false; + // And as such, it cannot be an extension of a + // previous, long string variable. + OBSwiseTypelList.add(recordType2FixedPart1[1]); + variableCounter++; + isNumericVariable = true; + variableTypelList.add(recordType2FixedPart1[1]); + } else if (recordType2FixedPart1[1] > 0){ + + // This looks like a regular string variable. However, + // it may still be a part of a compound variable + // (a String > 255 bytes that was split into 255 byte + // chunks, stored as individual String variables). 
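+                    // Illustration (hypothetical names and sizes): a string declared as,
+                    // say, 600 bytes wide may arrive here as a first variable of width 255
+                    // followed by additional string variables holding the remaining bytes,
+                    // whose names are derived from the first five characters of the
+                    // preceding name (e.g. "LONGS", then "LONGS0", "LONGS1", ...). The
+                    // name-increment checks below attempt to detect that pattern and treat
+                    // the trailing segments as continuation OBS units of one compound
+                    // variable rather than as independent variables.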
+ + if (recordType2FixedPart1[1] == 255){ + obs255 = true; + } + + if ( lastVariableIsExtendable ) { + String varNameBase = null; + if ( lastVariableName.length() > 5 ) { + varNameBase = lastVariableName.substring (0, 5); + } else { + varNameBase = lastVariableName; + } + + if ( extendedVariableMode ) { + if ( variableNameIsAnIncrement ( varNameBase, lastExtendedVariable, variableName ) ) { + OBSwiseTypelList.add(-1); + lastExtendedVariable = variableName; + // OK, we stay in the "extended variable" mode; + // but we can't move on to the next OBS (hence the commented out + // "continue" below: + //continue; + // see the next comment below for the explanation. + // + // Should we also set "extendable" flag to false at this point + // if it's shorter than 255 bytes, i.e. the last extended chunk? + } else { + extendedVariableMode = false; + } + } else { + if ( variableNameIsAnIncrement ( varNameBase, variableName ) ) { + OBSwiseTypelList.add(-1); + extendedVariableMode = true; + dbgLog.fine("RT2: in extended variable mode; variable "+variableName); + lastExtendedVariable = variableName; + // Before we move on to the next OBS unit, we need to check + // if this current extended variable has its own label specified; + // If so, we need to determine its length, then read and skip + // that many bytes. + // Hence the commented out "continue" below: + //continue; + } + } + } + + if ( !extendedVariableMode) { + // OK, this is a "real" + // string variable, and not a continuation chunk of a compound + // string. + + OBSwiseTypelList.add(recordType2FixedPart1[1]); + variableCounter++; + + if (recordType2FixedPart1[1] == 255){ + // This variable is 255 bytes long, i.e. this is + // either the single "atomic" variable of the + // max allowed size, or it's a 255 byte segment + // of a compound variable. So we will check + // the next variable and see if it is the continuation + // of this one. + + lastVariableIsExtendable = true; + } else { + lastVariableIsExtendable = false; + } + + if (recordType2FixedPart1[1] % LENGTH_SAV_OBS_BLOCK == 0){ + HowManyRt2Units = recordType2FixedPart1[1] / LENGTH_SAV_OBS_BLOCK; + } else { + HowManyRt2Units = recordType2FixedPart1[1] / LENGTH_SAV_OBS_BLOCK +1; + } + variableTypelList.add(recordType2FixedPart1[1]); + } + } + + if ( !extendedVariableMode ) { + // Again, we only want to do the following steps for the "real" + // variables, not the chunks of split mega-variables: + + dbgLog.fine("RT2: HowManyRt2Units for this variable="+HowManyRt2Units); + + lastVariableName = variableName; + + // caseWeightVariableOBSIndex starts from 1: 0 is used for does-not-exist cases + if (j == (caseWeightVariableOBSIndex - 1)){ + caseWeightVariableName = variableName; + // TODO: do we need this "index"? -- 4.0 alpha + caseWeightVariableIndex = variableCounter; + + ///smd.setCaseWeightVariableName(caseWeightVariableName); + ///smd.getFileInformation().put("caseWeightVariableIndex", caseWeightVariableIndex); + } + + OBSIndexToVariableName.put(j, variableName); + + //dbgLog.fine("\nvariable name="+variableName+"<-"); + dbgLog.fine("RT2: "+j+"-th variable name="+variableName+"<-"); + dbgLog.fine("RT2: raw variable: "+RawVariableName); + + variableNameList.add(variableName); + } + + + + // 3rd ([2]) element: = 1 variable-label block follows; 0 = no label + // + dbgLog.fine("RT: variable label follows?(1:yes; 0: no)="+recordType2FixedPart1[2]); + boolean hasVariableLabel = recordType2FixedPart1[2] == 1 ? 
true : false; + if ((recordType2FixedPart1[2] != 0) && (recordType2FixedPart1[2] != 1)) { + throw new IOException("RT2: reading error: value is neither 0 or 1"+ + recordType2FixedPart1[2]); + } + + // 2.4 [optional]The length of a variable label followed: 4-byte int + // 3rd element of 2.1 indicates whether this field exists + // *** warning: The label block is padded to a multiple of the 4-byte + // NOT the raw integer value of this 4-byte block + + + if (hasVariableLabel){ + byte[] length_variable_label= new byte[4]; + int nbytes_2_4 = stream.read(length_variable_label); + if (nbytes_2_4 == 0){ + throw new IOException("RT 2: error reading recordType2.4: no bytes read!"); + } else { + dbgLog.fine("nbytes_2_4="+nbytes_2_4); + } + ByteBuffer bb_length_variable_label = ByteBuffer.wrap( + length_variable_label, 0, LENGTH_VARIABLE_LABEL); + if (isLittleEndian){ + bb_length_variable_label.order(ByteOrder.LITTLE_ENDIAN); + } + int rawVariableLabelLength = bb_length_variable_label.getInt(); + + dbgLog.fine("rawVariableLabelLength="+rawVariableLabelLength); + int variableLabelLength = getSAVintAdjustedBlockLength(rawVariableLabelLength); + dbgLog.fine("RT2: variableLabelLength="+variableLabelLength); + + // 2.5 [optional]variable label whose length is found at 2.4 + + String variableLabel = ""; + + if (rawVariableLabelLength > 0) { + byte[] variable_label = new byte[variableLabelLength]; + int nbytes_2_5 = stream.read(variable_label); + if (nbytes_2_5 == 0){ + throw new IOException("RT 2: error reading recordType2.5: " + +variableLabelLength+" bytes requested, no bytes read!"); + } else { + dbgLog.fine("nbytes_2_5="+nbytes_2_5); + } + variableLabel = new String(Arrays.copyOfRange(variable_label, + 0, rawVariableLabelLength),defaultCharSet); + dbgLog.fine("RT2: variableLabel="+variableLabel+"<-"); + + dbgLog.info(variableName + " => " + variableLabel); + } else { + dbgLog.fine("RT2: defaulting to empty variable label."); + } + + if (!extendedVariableMode) { + // We only have any use for this label if it's a "real" variable. + // Thinking about it, it doesn't make much sense for the "fake" + // variables that are actually chunks of large strings to store + // their own labels. But in some files they do. Then failing to read + // the bytes would result in getting out of sync with the RT record + // borders. So we always read the bytes, but only use them for + // the real variable entries. + /*String variableLabel = new String(Arrays.copyOfRange(variable_label, + 0, rawVariableLabelLength),"US-ASCII");*/ + + variableLabelMap.put(variableName, variableLabel); + } + } + + if (extendedVariableMode) { + // there's nothing else left for us to do in this iteration of the loop. + // Once again, this was not a real variable, but a dummy variable entry + // created for a chunk of a string variable longer than 255 bytes -- + // that's how SPSS stores them. + continue; + } + + // 4th ([3]) element: Missing value type code + // 0[none], 1, 2, 3 [point-type],-2[range], -3 [range type+ point] + + dbgLog.fine("RT: missing value unit follows?(if 0, none)="+recordType2FixedPart1[3]); + boolean hasMissingValues = + (validMissingValueCodeSet.contains( + recordType2FixedPart1[3]) && (recordType2FixedPart1[3] !=0)) ? 
+ true : false; + + InvalidData invalidDataInfo = null; + + if (recordType2FixedPart1[3] !=0){ + invalidDataInfo = new InvalidData(recordType2FixedPart1[3]); + dbgLog.fine("RT: missing value type="+invalidDataInfo.getType()); + } + + // 2.2: print/write formats: 4-byte each = 8 bytes + + byte[] printFormt = Arrays.copyOfRange(recordType2Fixed, offset, offset+ + LENGTH_PRINT_FORMAT_CODE); + dbgLog.fine("printFrmt="+new String (Hex.encodeHex(printFormt))); + + + offset +=LENGTH_PRINT_FORMAT_CODE; + int formatCode = isLittleEndian ? printFormt[2] : printFormt[1]; + int formatWidth = isLittleEndian ? printFormt[1] : printFormt[2]; + + // TODO: + // What should we be doing with these "format decimal positions" + // in 4.0? + // -- L.A. 4.0 alpha + + int formatDecimalPointPosition = isLittleEndian ? printFormt[0] : printFormt[3]; + dbgLog.fine("RT2: format code{5=F, 1=A[String]}="+formatCode); + + formatDecimalPointPositionList.add(formatDecimalPointPosition); + + + if (!SPSSConstants.FORMAT_CODE_TABLE_SAV.containsKey(formatCode)){ + throw new IOException("Unknown format code was found = " + + formatCode); + } else{ + printFormatList.add(formatCode); + } + + byte[] writeFormt = Arrays.copyOfRange(recordType2Fixed, offset, offset+ + LENGTH_WRITE_FORMAT_CODE); + + dbgLog.fine("RT2: writeFrmt="+new String (Hex.encodeHex(writeFormt))); + if (writeFormt[3] != 0x00){ + dbgLog.fine("byte-order(write format): reversal required"); + } + + offset +=LENGTH_WRITE_FORMAT_CODE; + + if (!SPSSConstants.ORDINARY_FORMAT_CODE_SET.contains(formatCode)) { + StringBuilder sb = new StringBuilder( + SPSSConstants.FORMAT_CODE_TABLE_SAV.get(formatCode)+ + formatWidth); + if (formatDecimalPointPosition > 0){ + sb.append("."+ formatDecimalPointPosition); + } + dbgLog.info("formattable[i] = " + variableName + " -> " + sb.toString()); + printFormatNameTable.put(variableName, sb.toString()); + + } + + printFormatTable.put(variableName, SPSSConstants.FORMAT_CODE_TABLE_SAV.get(formatCode)); + + + // 2.6 [optional] missing values:4-byte each if exists + // 4th element of 2.1 indicates the structure of this sub-field + + // Should we perhaps check for this for the "fake" variables too? + // + + if (hasMissingValues) { + dbgLog.fine("RT2: decoding missing value: type="+recordType2FixedPart1[3]); + int howManyMissingValueUnits = missingValueCodeUnits.get(recordType2FixedPart1[3]); + //int howManyMissingValueUnits = recordType2FixedPart1[3] > 0 ? 
recordType2FixedPart1[3] : 0; + + dbgLog.fine("RT2: howManyMissingValueUnits="+howManyMissingValueUnits); + + byte[] missing_value_code_units = new byte[LENGTH_SAV_OBS_BLOCK*howManyMissingValueUnits]; + int nbytes_2_6 = stream.read(missing_value_code_units); + + if (nbytes_2_6 == 0){ + throw new IOException("RT 2: reading recordType2.6: no byte was read"); + } else { + dbgLog.fine("nbytes_2_6="+nbytes_2_6); + } + + //printHexDump(missing_value_code_units, "missing value"); + + if (isNumericVariable){ + + double[] missingValues = new double[howManyMissingValueUnits]; + //List mvp = new ArrayList(); + List mv = new ArrayList(); + + ByteBuffer[] bb_missig_value_code = + new ByteBuffer[howManyMissingValueUnits]; + + int offset_start = 0; + + for (int i= 0; i < howManyMissingValueUnits;i++ ){ + + bb_missig_value_code[i] = + ByteBuffer.wrap(missing_value_code_units, offset_start, + LENGTH_SAV_OBS_BLOCK); + + offset_start +=LENGTH_SAV_OBS_BLOCK; + if (isLittleEndian){ + bb_missig_value_code[i].order(ByteOrder.LITTLE_ENDIAN); + } + + ByteBuffer temp = bb_missig_value_code[i].duplicate(); + + + missingValues[i] = bb_missig_value_code[i].getDouble(); + if (Double.toHexString(missingValues[i]).equals("-0x1.ffffffffffffep1023")){ + dbgLog.fine("1st value is LOWEST"); + mv.add(Double.toHexString(missingValues[i])); + } else if (Double.valueOf(missingValues[i]).equals(Double.MAX_VALUE)){ + dbgLog.fine("2nd value is HIGHEST"); + mv.add(Double.toHexString(missingValues[i])); + } else { + mv.add(doubleNumberFormatter.format(missingValues[i])); + } + dbgLog.fine(i+"-th missing value="+Double.toHexString(missingValues[i])); + } + + dbgLog.fine("variableName="+variableName); + if (recordType2FixedPart1[3] > 0) { + // point cases only + dbgLog.fine("mv(>0)="+mv); + missingValueTable.put(variableName, mv); + invalidDataInfo.setInvalidValues(mv); + } else if (recordType2FixedPart1[3]== -2) { + dbgLog.fine("mv(-2)="+mv); + // range + invalidDataInfo.setInvalidRange(mv); + } else if (recordType2FixedPart1[3]== -3){ + // mixed case + dbgLog.fine("mv(-3)="+mv); + invalidDataInfo.setInvalidRange(mv.subList(0, 2)); + invalidDataInfo.setInvalidValues(mv.subList(2, 3)); + missingValueTable.put(variableName, mv.subList(2, 3)); + } + + dbgLog.fine("missing value="+ + StringUtils.join(missingValueTable.get(variableName),"|")); + dbgLog.fine("invalidDataInfo(Numeric):\n"+invalidDataInfo); + invalidDataTable.put(variableName, invalidDataInfo); + } else { + // string variable case + String[] missingValues = new String[howManyMissingValueUnits]; + List mv = new ArrayList(); + int offset_start = 0; + int offset_end = LENGTH_SAV_OBS_BLOCK; + for (int i= 0; i < howManyMissingValueUnits;i++ ){ + + missingValues[i] = + StringUtils.stripEnd(new + String(Arrays.copyOfRange(missing_value_code_units, offset_start, offset_end),defaultCharSet), " "); + dbgLog.fine("missing value="+missingValues[i]+"<-"); + + offset_start = offset_end; + offset_end +=LENGTH_SAV_OBS_BLOCK; + + mv.add(missingValues[i]); + } + invalidDataInfo.setInvalidValues(mv); + missingValueTable.put(variableName, mv); + invalidDataTable.put(variableName, invalidDataInfo); + dbgLog.fine("missing value(str)="+ + StringUtils.join(missingValueTable.get(variableName),"|")); + dbgLog.fine("invalidDataInfo(String):\n"+invalidDataInfo); + + } // string case + dbgLog.fine("invalidDataTable:\n"+invalidDataTable); + } // if msv + + } catch (IOException ex){ + //ex.printStackTrace(); + throw ex; + } catch (Exception ex){ + ex.printStackTrace(); + // should we be throwing some 
exception here? + } + } // j-loop + + if (j != OBSUnitsPerCase ) { + dbgLog.info("RT2: attention! didn't reach the end of the OBS list!"); + throw new IOException("RT2: didn't reach the end of the OBS list!"); + } + + dbgLog.fine("RT2 metadata-related exit-chores"); + ///smd.getFileInformation().put("varQnty", variableCounter); + dataTable.setVarQuantity(new Long(variableCounter)); + dbgLog.fine("RT2: varQnty=" + variableCounter); + + // 4.0 Initialize variables: + List variableList = new ArrayList(); + + for (int i = 0; i < variableCounter; i++) { + DataVariable dv = new DataVariable(); + String varName = variableNameList.get(i); + dv.setName(varName); + dv.setLabel(variableLabelMap.get(varName)); + dv.setFormatSchemaName(printFormatNameTable.get(varName)); + + dv.setInvalidRanges(new ArrayList()); + dv.setSummaryStatistics( new ArrayList() ); + dv.setUnf("UNF:6:NOTCALCULATED"); + dv.setCategories(new ArrayList()); + variableList.add(dv); + + dv.setFileOrder(i); + + dv.setDataTable(dataTable); + } + + dataTable.setDataVariables(variableList); + + ///smd.setVariableName(variableNameList.toArray(new String[variableNameList.size()])); + ///smd.setVariableLabel(variableLabelMap); + // TODO: + // figure out what to do with the missing value table! + // -- 4.0 alpha + // well, they were used to generate merged summary statistics for + // the variable. So need to verify what the DDI import was doing + // with them and replicate the same in 4.0. + // (add appropriate value labels?) + ///TODO: 4.0 smd.setMissingValueTable(missingValueTable); + ///smd.getFileInformation().put("caseWeightVariableName", caseWeightVariableName); + + dbgLog.fine("sumstat:long case=" + Arrays.deepToString(variableTypelList.toArray())); + + // 4.0 + // "printFoprmatList"/SMD VariableFormat - doesn't seem to be used + // anywhere in v. 3.* ! (TODO: double-check! 
-- 4.0 alpha) + ///smd.setVariableFormat(printFormatList); + // 4.0 + // "variableFormatName" is what ends up being in the "formatName" var + // attribute in the DDI; + // in the DataVariable object it corresponds to getFormatSchemaName(); + + ///smd.setVariableFormatName(printFormatNameTable); + + ///dbgLog.info("<<<<<<"); + ///dbgLog.info("printFormatList = " + printFormatList); + ///dbgLog.info("printFormatNameTable = " + printFormatNameTable); + // dbgLog.info("formatCategoryTable = " + formatCategoryTable); + ///dbgLog.info(">>>>>>"); + + dbgLog.fine("RT2: OBSwiseTypelList=" + OBSwiseTypelList); + + // variableType is determined after the valueTable is finalized + dbgLog.fine("decodeRecordType2(): end"); + } + + void decodeRecordType3and4(BufferedInputStream stream) throws IOException { + dbgLog.fine("decodeRecordType3and4(): start"); + Map> valueLabelTable + = new LinkedHashMap>(); + + int safteyCounter = 0; + while (true) { + try { + if (stream == null) { + throw new IllegalArgumentException("stream == null!"); + } + // this secton may not exit so first check the 4-byte header value + //if (stream.markSupported()){ + stream.mark(1000); + //} + // 3.0 check the first 4 bytes + byte[] headerCode = new byte[LENGTH_RECORD_TYPE3_CODE]; + + int nbytes_rt3 = stream.read(headerCode, 0, LENGTH_RECORD_TYPE3_CODE); + // to-do check against nbytes + //printHexDump(headerCode, "RT3 header test"); + ByteBuffer bb_header_code = ByteBuffer.wrap(headerCode, + 0, LENGTH_RECORD_TYPE3_CODE); + if (isLittleEndian) { + bb_header_code.order(ByteOrder.LITTLE_ENDIAN); + } + + int intRT3test = bb_header_code.getInt(); + dbgLog.fine("header test value: RT3=" + intRT3test); + if (intRT3test != 3) { + //if (stream.markSupported()){ + dbgLog.fine("iteration=" + safteyCounter); + + // We have encountered a record that's not type 3. This means we've + // processed all the type 3/4 record pairs. So we want to rewind + // the stream and return -- so that the appropriate record type + // reader can be called on it. 
+ // But before we return, we need to save all the value labels + // we have found: + //smd.setValueLabelTable(valueLabelTable); + assignValueLabels(valueLabelTable); + + stream.reset(); + return; + //} + } + // 3.1 how many value-label pairs follow + byte[] number_of_labels = new byte[LENGTH_RT3_HOW_MANY_LABELS]; + + int nbytes_3_1 = stream.read(number_of_labels); + if (nbytes_3_1 == 0) { + throw new IOException("RT 3: reading recordType3.1: no byte was read"); + } + ByteBuffer bb_number_of_labels = ByteBuffer.wrap(number_of_labels, + 0, LENGTH_RT3_HOW_MANY_LABELS); + if (isLittleEndian) { + bb_number_of_labels.order(ByteOrder.LITTLE_ENDIAN); + } + + int numberOfValueLabels = bb_number_of_labels.getInt(); + dbgLog.fine("number of value-label pairs=" + numberOfValueLabels); + + ByteBuffer[] tempBB = new ByteBuffer[numberOfValueLabels]; + + String valueLabel[] = new String[numberOfValueLabels]; + + for (int i = 0; i < numberOfValueLabels; i++) { + + // read 8-byte as value + byte[] value = new byte[LENGTH_RT3_VALUE]; + int nbytes_3_value = stream.read(value); + + if (nbytes_3_value == 0) { + throw new IOException("RT 3: reading recordType3 value: no byte was read"); + } + // note these 8 bytes are interpreted later + // currently no information about which variable's (=> type unknown) + ByteBuffer bb_value = ByteBuffer.wrap(value, + 0, LENGTH_RT3_VALUE); + if (isLittleEndian) { + bb_value.order(ByteOrder.LITTLE_ENDIAN); + } + tempBB[i] = bb_value; + dbgLog.fine("bb_value=" + Hex.encodeHex(bb_value.array())); + /* + double valueD = bb_value.getDouble(); + dbgLog.fine("value="+valueD); + */ + // read 1st byte as unsigned integer = label_length + + // read label_length byte as label + byte[] labelLengthByte = new byte[LENGTH_RT3_LABEL_LENGTH]; + + int nbytes_3_label_length = stream.read(labelLengthByte); + + // add check-routine here + dbgLog.fine("labelLengthByte" + Hex.encodeHex(labelLengthByte)); + dbgLog.fine("label length = " + labelLengthByte[0]); + // the net-length of a value label is saved as + // unsigned byte; however, the length is less than 127 + // byte should be ok + int rawLabelLength = labelLengthByte[0] & 0xFF; + dbgLog.fine("rawLabelLength=" + rawLabelLength); + // -1 =>1-byte already read + int labelLength = getSAVobsAdjustedBlockLength(rawLabelLength + 1) - 1; + byte[] valueLabelBytes = new byte[labelLength]; + int nbytes_3_value_label = stream.read(valueLabelBytes); + + // ByteBuffer bb_label = ByteBuffer.wrap(valueLabel,0,labelLength); + valueLabel[i] = StringUtils.stripEnd(new String(Arrays.copyOfRange(valueLabelBytes, 0, rawLabelLength), defaultCharSet), " "); + dbgLog.fine(i + "-th valueLabel=" + valueLabel[i] + "<-"); + + } // iter rt3 + + dbgLog.fine("end of RT3 block"); + dbgLog.fine("start of RT4 block"); + + // 4.0 check the first 4 bytes + byte[] headerCode4 = new byte[LENGTH_RECORD_TYPE4_CODE]; + + int nbytes_rt4 = stream.read(headerCode4, 0, LENGTH_RECORD_TYPE4_CODE); + + if (nbytes_rt4 == 0) { + throw new IOException("RT4: reading recordType4 value: no byte was read"); + } + + //printHexDump(headerCode4, "RT4 header test"); + ByteBuffer bb_header_code_4 = ByteBuffer.wrap(headerCode4, + 0, LENGTH_RECORD_TYPE4_CODE); + if (isLittleEndian) { + bb_header_code_4.order(ByteOrder.LITTLE_ENDIAN); + } + + int intRT4test = bb_header_code_4.getInt(); + dbgLog.fine("header test value: RT4=" + intRT4test); + + if (intRT4test != 4) { + throw new IOException("RT 4: reading recordType4 header: no byte was read"); + } + + // 4.1 read the how-many-variables bytes + byte[] 
howManyVariablesfollow = new byte[LENGTH_RT4_HOW_MANY_VARIABLES]; + + int nbytes_rt4_1 = stream.read(howManyVariablesfollow, 0, LENGTH_RT4_HOW_MANY_VARIABLES); + + ByteBuffer bb_howManyVariablesfollow = ByteBuffer.wrap(howManyVariablesfollow, + 0, LENGTH_RT4_HOW_MANY_VARIABLES); + if (isLittleEndian) { + bb_howManyVariablesfollow.order(ByteOrder.LITTLE_ENDIAN); + } + + int howManyVariablesRT4 = bb_howManyVariablesfollow.getInt(); + dbgLog.fine("how many variables follow: RT4=" + howManyVariablesRT4); + + int length_indicies = LENGTH_RT4_VARIABLE_INDEX * howManyVariablesRT4; + byte[] variableIdicesBytes = new byte[length_indicies]; + + int nbytes_rt4_2 = stream.read(variableIdicesBytes, 0, length_indicies); + + // !!!!! Caution: variableIndex in RT4 starts from 1 NOT ** 0 ** + int[] variableIndex = new int[howManyVariablesRT4]; + int offset = 0; + for (int i = 0; i < howManyVariablesRT4; i++) { + + ByteBuffer bb_variable_index = ByteBuffer.wrap(variableIdicesBytes, + offset, LENGTH_RT4_VARIABLE_INDEX); + offset += LENGTH_RT4_VARIABLE_INDEX; + + if (isLittleEndian) { + bb_variable_index.order(ByteOrder.LITTLE_ENDIAN); + } + + variableIndex[i] = bb_variable_index.getInt(); + dbgLog.fine(i + "-th variable index number=" + variableIndex[i]); + } + + dbgLog.fine("variable index set=" + ArrayUtils.toString(variableIndex)); + dbgLog.fine("subtract 1 from variableIndex for getting a variable info"); + + boolean isNumeric = OBSwiseTypelList.get(variableIndex[0] - 1) == 0 ? true : false; + + Map valueLabelPair = new LinkedHashMap(); + if (isNumeric) { + // numeric variable + dbgLog.fine("processing of a numeric value-label table"); + for (int j = 0; j < numberOfValueLabels; j++) { + valueLabelPair.put(doubleNumberFormatter.format(tempBB[j].getDouble()), valueLabel[j]); + } + } else { + // String variable + dbgLog.fine("processing of a string value-label table"); + for (int j = 0; j < numberOfValueLabels; j++) { + valueLabelPair.put( + StringUtils.stripEnd(new String((tempBB[j].array()), defaultCharSet), " "), valueLabel[j]); + } + } + + dbgLog.fine("valueLabePair=" + valueLabelPair); + dbgLog.fine("key variable's (raw) index =" + variableIndex[0]); + + valueLabelTable.put(OBSIndexToVariableName.get(variableIndex[0] - 1), valueLabelPair); + + dbgLog.fine("valueLabelTable=" + valueLabelTable); + + // create a mapping table that finds the key variable for this mapping table + String keyVariableName = OBSIndexToVariableName.get(variableIndex[0] - 1); + for (int vn : variableIndex) { + valueVariableMappingTable.put(OBSIndexToVariableName.get(vn - 1), keyVariableName); + } + + dbgLog.fine("valueVariableMappingTable:\n" + valueVariableMappingTable); + } catch (IOException ex) { + //ex.printStackTrace(); + throw ex; + } + + safteyCounter++; + if (safteyCounter >= 1000000) { + break; + } + } //while + + ///smd.setValueLabelTable(valueLabelTable); + assignValueLabels(valueLabelTable); + + dbgLog.fine("***** decodeRecordType3and4(): end *****"); + } + + void assignValueLabels(Map> valueLabelTable) { + // Let's go through all the categorical value label mappings and + // assign them to the correct variables: + + for (int i = 0; i < dataTable.getVarQuantity().intValue(); i++) { + + String varName = dataTable.getDataVariables().get(i).getName(); + + Map valueLabelPairs = valueLabelTable.get(varName); + if (valueLabelPairs != null && !valueLabelPairs.isEmpty()) { + for (String value : valueLabelPairs.keySet()) { + + VariableCategory cat = new VariableCategory(); + cat.setValue(value); + 
cat.setLabel(valueLabelPairs.get(value)); + + /* cross-link the variable and category to each other: */ + cat.setDataVariable(dataTable.getDataVariables().get(i)); + dataTable.getDataVariables().get(i).getCategories().add(cat); + } + } + } + } + + + void decodeRecordType6(BufferedInputStream stream) throws IOException { + dbgLog.fine("***** decodeRecordType6(): start *****"); + try { + if (stream ==null){ + throw new IllegalArgumentException("stream == null!"); + } + // this section is optional; so let's first check the 4-byte header + // value and see what type it is. + //if (stream.markSupported()){ // -- ? L.A. 4.0 alpha + stream.mark(1000); + //} + // 6.0 check the first 4 bytes + byte[] headerCodeRt6 = new byte[LENGTH_RECORD_TYPE6_CODE]; + + int nbytes_rt6 = stream.read(headerCodeRt6, 0, LENGTH_RECORD_TYPE6_CODE); + // to-do check against nbytes + //printHexDump(headerCodeRt6, "RT6 header test"); + ByteBuffer bb_header_code_rt6 = ByteBuffer.wrap(headerCodeRt6, + 0, LENGTH_RECORD_TYPE6_CODE); + if (isLittleEndian){ + bb_header_code_rt6.order(ByteOrder.LITTLE_ENDIAN); + } + + int intRT6test = bb_header_code_rt6.getInt(); + dbgLog.fine("RT6: header test value="+intRT6test); + if (intRT6test != 6){ + //if (stream.markSupported()){ + //out.print("iteration="+safteyCounter); + //dbgLog.fine("iteration="+safteyCounter); + dbgLog.fine("intRT6test failed="+intRT6test); + + stream.reset(); + return; + //} + } + // 6.1 check 4-byte integer that tells how many lines follow + + byte[] length_how_many_line_bytes = new byte[LENGTH_RT6_HOW_MANY_LINES]; + + int nbytes_rt6_1 = stream.read(length_how_many_line_bytes, 0, + LENGTH_RT6_HOW_MANY_LINES); + // to-do check against nbytes + + //printHexDump(length_how_many_line_bytes, "RT6 how_many_line_bytes"); + ByteBuffer bb_how_many_lines = ByteBuffer.wrap(length_how_many_line_bytes, + 0, LENGTH_RT6_HOW_MANY_LINES); + if (isLittleEndian){ + bb_how_many_lines.order(ByteOrder.LITTLE_ENDIAN); + } + + int howManyLinesRt6 = bb_how_many_lines.getInt(); + dbgLog.fine("how Many lines follow="+howManyLinesRt6); + + // 6.2 read 80-char-long lines + String[] documentRecord = new String[howManyLinesRt6]; + + for (int i=0;i releaseMachineSpecificInfo = new ArrayList(); + /// List releaseMachineSpecificInfoHex = new ArrayList(); + + /// // Subytpe 4 + /// Map OBSTypeValue = new LinkedHashMap(); + /// Map OBSTypeHexValue = new LinkedHashMap(); + //Subtype 11 + /// List measurementLevel = new ArrayList(); + /// List columnWidth = new ArrayList(); + /// List alignment = new ArrayList(); + + + + + while(true){ + try { + if (stream ==null){ + throw new IllegalArgumentException("RT7: stream == null!"); + } + // first check the 4-byte header value + //if (stream.markSupported()){ + stream.mark(1000); + //} + // 7.0 check the first 4 bytes + byte[] headerCodeRt7 = new byte[LENGTH_RECORD_TYPE7_CODE]; + + int nbytes_rt7 = stream.read(headerCodeRt7, 0, + LENGTH_RECORD_TYPE7_CODE); + // to-do check against nbytes + //printHexDump(headerCodeRt7, "RT7 header test"); + ByteBuffer bb_header_code_rt7 = ByteBuffer.wrap(headerCodeRt7, + 0, LENGTH_RECORD_TYPE7_CODE); + if (isLittleEndian){ + bb_header_code_rt7.order(ByteOrder.LITTLE_ENDIAN); + } + + int intRT7test = bb_header_code_rt7.getInt(); + dbgLog.fine("RT7: header test value="+intRT7test); + if (intRT7test != 7){ + //if (stream.markSupported()){ + //out.print("iteration="+safteyCounter); + //dbgLog.fine("iteration="+safteyCounter); + dbgLog.fine("intRT7test failed="+intRT7test); + dbgLog.fine("counter="+counter); + stream.reset(); + 
return; + //} + } + + // 7.1 check 4-byte integer Sub-Type Code + + byte[] length_sub_type_code = new byte[LENGTH_RT7_SUB_TYPE_CODE]; + + int nbytes_rt7_1 = stream.read(length_sub_type_code, 0, + LENGTH_RT7_SUB_TYPE_CODE); + // to-do check against nbytes + + //printHexDump(length_how_many_line_bytes, "RT7 how_many_line_bytes"); + ByteBuffer bb_sub_type_code = ByteBuffer.wrap(length_sub_type_code, + 0, LENGTH_RT7_SUB_TYPE_CODE); + if (isLittleEndian){ + bb_sub_type_code.order(ByteOrder.LITTLE_ENDIAN); + } + + int subTypeCode = bb_sub_type_code.getInt(); + dbgLog.fine("RT7: subTypeCode="+subTypeCode); + + + switch (subTypeCode) { + case 3: + // 3: Release andMachine-Specific Integer Information + + //parseRT7SubTypefield(stream); + + + headerSection = parseRT7SubTypefieldHeader(stream); + if (headerSection != null){ + int unitLength = headerSection[0]; + int numberOfUnits = headerSection[1]; + + + for (int i=0; i 20){ + break; + } + } + + dbgLog.fine("RT7: counter="+counter); + dbgLog.fine("RT7: decodeRecordType7(): end"); + } + + + void decodeRecordType999(BufferedInputStream stream) throws IOException { + dbgLog.fine("decodeRecordType999(): start"); + try { + if (stream ==null){ + throw new IllegalArgumentException("RT999: stream == null!"); + } + // first check the 4-byte header value + //if (stream.markSupported()){ + stream.mark(1000); + //} + // 999.0 check the first 4 bytes + byte[] headerCodeRt999 = new byte[LENGTH_RECORD_TYPE999_CODE]; + + //dbgLog.fine("RT999: stream position="+stream.pos); + + int nbytes_rt999 = stream.read(headerCodeRt999, 0, + LENGTH_RECORD_TYPE999_CODE); + // to-do check against nbytes + //printHexDump(headerCodeRt999, "RT999 header test"); + ByteBuffer bb_header_code_rt999 = ByteBuffer.wrap(headerCodeRt999, + 0, LENGTH_RECORD_TYPE999_CODE); + if (isLittleEndian){ + bb_header_code_rt999.order(ByteOrder.LITTLE_ENDIAN); + } + + int intRT999test = bb_header_code_rt999.getInt(); + dbgLog.fine("header test value: RT999="+intRT999test); + if (intRT999test != 999){ + //if (stream.markSupported()){ + dbgLog.fine("intRT999test failed="+intRT999test); + stream.reset(); + throw new IOException("RT999:Header value(999) was not correctly detected:"+intRT999test); + //} + } + + + + // 999.1 check 4-byte integer Filler block + + byte[] length_filler = new byte[LENGTH_RT999_FILLER]; + + int nbytes_rt999_1 = stream.read(length_filler, 0, + LENGTH_RT999_FILLER); + // to-do check against nbytes + + //printHexDump(length_how_many_line_bytes, "RT999 how_many_line_bytes"); + ByteBuffer bb_filler = ByteBuffer.wrap(length_filler, + 0, LENGTH_RT999_FILLER); + if (isLittleEndian){ + bb_filler.order(ByteOrder.LITTLE_ENDIAN); + } + + int rt999filler = bb_filler.getInt(); + dbgLog.fine("rt999filler="+rt999filler); + + if (rt999filler == 0){ + dbgLog.fine("the end of the dictionary section"); + } else { + throw new IOException("RT999: failed to detect the end mark(0): value="+rt999filler); + } + + // missing value processing concerning HIGHEST/LOWEST values + + Set> msvlc = invalidDataTable.entrySet(); + for (Iterator> itc = msvlc.iterator(); itc.hasNext();){ + Map.Entry et = itc.next(); + String variable = et.getKey(); + dbgLog.fine("variable="+variable); + InvalidData invalidDataInfo = et.getValue(); + + if (invalidDataInfo.getInvalidRange() != null && + !invalidDataInfo.getInvalidRange().isEmpty()){ + if (invalidDataInfo.getInvalidRange().get(0).equals(OBSTypeHexValue.get("LOWEST"))){ + dbgLog.fine("1st value is LOWEST"); + invalidDataInfo.getInvalidRange().set(0, "LOWEST"); + } else if 
(invalidDataInfo.getInvalidRange().get(1).equals(OBSTypeHexValue.get("HIGHEST"))){ + dbgLog.fine("2nd value is HIGHEST"); + invalidDataInfo.getInvalidRange().set(1,"HIGHEST"); + } + } + } + dbgLog.fine("invalidDataTable:\n"+invalidDataTable); + // TODO: take care of the invalid data! - add the appropriate + // value labels (?) + // should it be done here, or at the end of ingest? + // -- L.A. 4.0 alpha + ///smd.setInvalidDataTable(invalidDataTable); + } catch (IOException ex){ + //ex.printStackTrace(); + //exit(1); + throw ex; + } + + dbgLog.fine("decodeRecordType999(): end"); + } + + + + void decodeRecordTypeData(BufferedInputStream stream) throws IOException { + dbgLog.fine("decodeRecordTypeData(): start"); + + ///String fileUnfValue = null; + ///String[] unfValues = null; + + + + if (stream ==null){ + throw new IllegalArgumentException("stream == null!"); + } + if (isDataSectionCompressed){ + decodeRecordTypeDataCompressed(stream); + } else { + decodeRecordTypeDataUnCompressed(stream); + } + + /* UNF calculation was here... */ + + dbgLog.fine("***** decodeRecordTypeData(): end *****"); + } + + PrintWriter createOutputWriter (BufferedInputStream stream) throws IOException { + PrintWriter pwout = null; + FileOutputStream fileOutTab = null; + + try { + + // create a File object to save the tab-delimited data file + File tabDelimitedDataFile = File.createTempFile("tempTabfile.", ".tab"); + + String tabDelimitedDataFileName = tabDelimitedDataFile.getAbsolutePath(); + + // save the temp file name in the metadata object + ///smd.getFileInformation().put("tabDelimitedDataFileLocation", tabDelimitedDataFileName); + ingesteddata.setTabDelimitedFile(tabDelimitedDataFile); + + fileOutTab = new FileOutputStream(tabDelimitedDataFile); + + pwout = new PrintWriter(new OutputStreamWriter(fileOutTab, "utf8"), true); + + } catch (FileNotFoundException ex) { + ex.printStackTrace(); + } catch (UnsupportedEncodingException ex) { + ex.printStackTrace(); + } catch (IOException ex){ + //ex.printStackTrace(); + throw ex; + } + + return pwout; + + } + + void decodeRecordTypeDataCompressed(BufferedInputStream stream) throws IOException { + + dbgLog.fine("***** decodeRecordTypeDataCompressed(): start *****"); + + if (stream == null) { + throw new IllegalArgumentException("decodeRecordTypeDataCompressed: stream == null!"); + } + + PrintWriter pwout = createOutputWriter(stream); + + int varQnty = dataTable.getVarQuantity().intValue(); + int caseQnty = dataTable.getCaseQuantity().intValue(); + + dbgLog.fine("varQnty: " + varQnty); + + + boolean hasStringVarContinuousBlock = + obsNonVariableBlockSet.size() > 0 ? true : false; + dbgLog.fine("hasStringVarContinuousBlock=" + hasStringVarContinuousBlock); + + int ii = 0; + + int OBS = LENGTH_SAV_OBS_BLOCK; + int nOBS = OBSUnitsPerCase; + + dbgLog.fine("OBSUnitsPerCase=" + OBSUnitsPerCase); + + int caseIndex = 0; + + dbgLog.fine("printFormatTable:\n" + printFormatTable); + variableFormatTypeList = new String[varQnty]; + + + + for (int i = 0; i < varQnty; i++) { + variableFormatTypeList[i] = SPSSConstants.FORMAT_CATEGORY_TABLE.get( + printFormatTable.get(variableNameList.get(i))); + dbgLog.fine("i=" + i + "th variableFormatTypeList=" + variableFormatTypeList[i]); + formatCategoryTable.put(variableNameList.get(i), variableFormatTypeList[i]); + } + dbgLog.fine("variableFormatType:\n" + Arrays.deepToString(variableFormatTypeList)); + dbgLog.fine("formatCategoryTable:\n" + formatCategoryTable); + + // TODO: + // Make sure the date formats are actually preserved! 
+ // (this is something that was collected in the code below and passed + // to the UNF calculator). + // -- L.A. 4.0 alpha + List casewiseRecordForTabFile = new ArrayList(); + + try { + // this compression is applied only to non-float data, i.e. integer; + // 8-byte float datum is kept in tact + boolean hasReachedEOF = false; + + OBSERVATION: + while (true) { + + dbgLog.fine("SAV Reader: compressed: ii=" + ii + "-th iteration"); + + byte[] octate = new byte[LENGTH_SAV_OBS_BLOCK]; + + int nbytes = stream.read(octate); + + // processCompressedOBSblock () + + // (this means process a block of 8 compressed OBS + // values -- should result in 64 bytes of data total) + + for (int i = 0; i < LENGTH_SAV_OBS_BLOCK; i++) { + + + dbgLog.finer("i=" + i + "-th iteration"); + int octate_i = octate[i]; + //dbgLog.fine("octate="+octate_i); + if (octate_i < 0) { + octate_i += 256; + } + int byteCode = octate_i;//octate_i & 0xF; + //out.println("byeCode="+byteCode); + + // processCompressedOBS + + switch (byteCode) { + case 252: + // end of the file + dbgLog.fine("SAV Reader: compressed: end of file mark [FC] was found"); + hasReachedEOF = true; + break; + case 253: + // FD: uncompressed data follows after this octate + // long string datum or float datum + // read the following octate + byte[] uncompressedByte = new byte[LENGTH_SAV_OBS_BLOCK]; + int ucbytes = stream.read(uncompressedByte); + int typeIndex = (ii * OBS + i) % nOBS; + + if ((OBSwiseTypelList.get(typeIndex) > 0) || + (OBSwiseTypelList.get(typeIndex) == -1)) { + // code= >0 |-1: string or its conitiguous block + // decode as a string object + String strdatum = new String( + Arrays.copyOfRange(uncompressedByte, + 0, LENGTH_SAV_OBS_BLOCK), defaultCharSet); + //out.println("str_datum="+strdatum+"<-"); + // add this non-missing-value string datum + casewiseRecordForTabFile.add(strdatum); + //out.println("casewiseRecordForTabFile(String)="+casewiseRecordForTabFile); + } else if (OBSwiseTypelList.get(typeIndex) == -2) { + String strdatum = new String( + Arrays.copyOfRange(uncompressedByte, + 0, LENGTH_SAV_OBS_BLOCK - 1), defaultCharSet); + casewiseRecordForTabFile.add(strdatum); + //out.println("casewiseRecordForTabFile(String)="+casewiseRecordForTabFile); + } else if (OBSwiseTypelList.get(typeIndex) == 0) { + // code= 0: numeric + + ByteBuffer bb_double = ByteBuffer.wrap( + uncompressedByte, 0, LENGTH_SAV_OBS_BLOCK); + if (isLittleEndian) { + bb_double.order(ByteOrder.LITTLE_ENDIAN); + } + + Double ddatum = bb_double.getDouble(); + // out.println("ddatum="+ddatum); + // add this non-missing-value numeric datum + casewiseRecordForTabFile.add(doubleNumberFormatter.format(ddatum)); + dbgLog.fine("SAV Reader: compressed: added value to dataLine: " + ddatum); + + } else { + dbgLog.fine("SAV Reader: out-of-range exception"); + throw new IOException("out-of-range value was found"); + } + + /* + // EOF-check after reading this octate + if (stream.available() == 0){ + hasReachedEOF = true; + dbgLog.fine( + "SAV Reader: *** After reading an uncompressed octate," + + " reached the end of the file at "+ii + +"th iteration and i="+i+"th octate position [0-start] *****"); + } + */ + + + break; + case 254: + // FE: used as the missing value for string variables + // an empty case in a string variable also takes this value + // string variable does not accept space-only data + // cf: uncompressed case + // 20 20 20 20 20 20 20 20 + // add the string missing value + // out.println("254: String missing data"); + + casewiseRecordForTabFile.add(" "); // add "." here? 
+ + + // Note that technically this byte flag (254/xFE) means + // that *eight* white space characters should be + // written to the output stream. This caused me + // a great amount of confusion, because it appeared + // to me that there was a mismatch between the number + // of bytes advertised in the variable metadata and + // the number of bytes actually found in the data + // section of a compressed SAV file; this is because + // these 8 bytes "come out of nowhere"; they are not + // written in the data section, but this flag specifies + // that they should be added to the output. + // Also, as I pointed out above, we are only writing + // out one whitespace character, not 8 as instructed. + // This appears to be legit; these blocks of 8 spaces + // seem to be only used for padding, and all such + // multiple padding spaces are stripped anyway during + // the post-processing. + + + break; + case 255: + // FF: system missing value for numeric variables + // cf: uncompressed case (sysmis) + // FF FF FF FF FF FF eF FF(little endian) + // add the numeric missing value + dbgLog.fine("SAV Reader: compressed: Missing Value, numeric"); + casewiseRecordForTabFile.add(MissingValueForTextDataFileNumeric); + + break; + case 0: + // 00: do nothing + dbgLog.fine("SAV Reader: compressed: doing nothing (zero); "); + + break; + default: + //out.println("byte code(default)="+ byteCode); + if ((byteCode > 0) && (byteCode < 252)) { + // datum is compressed + //Integer unCompressed = Integer.valueOf(byteCode -100); + // add this uncompressed numeric datum + Double unCompressed = Double.valueOf(byteCode - 100); + dbgLog.fine("SAV Reader: compressed: default case: " + unCompressed); + + casewiseRecordForTabFile.add(doubleNumberFormatter.format(unCompressed)); + // out.println("uncompressed="+unCompressed); + // out.println("dataline="+casewiseRecordForTabFile); + } + }// end of switch + + // out.println("end of switch"); + + + // The-end-of-a-case(row)-processing + + // this line that follows, and the code around it + // is really confusing: + int varCounter = (ii * OBS + i + 1) % nOBS; + // while both OBS and LENGTH_SAV_OBS_BLOCK = 8 + // (OBS was initialized as OBS=LENGTH_SAV_OBS_BLOCK), + // the 2 values mean different things: + // LENGTH_SAV_OBS_BLOCK is the number of bytes in one OBS; + // and OBS is the number of OBS blocks that we process + // at a time. I.e., we process 8 chunks of 8 bytes at a time. + // This is how data is organized inside an SAV file: + // 8 bytes of compression flags, followd by 8x8 or fewer + // (depending on the flags) bytes of compressed data. + // I should rename this OBS variable something more + // meaningful. + // + // Also, the "varCounter" variable name is entirely + // misleading -- it counts not variables, but OBS blocks. 
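For reference, a minimal sketch of the control-byte scheme that the switch above implements. This is an editor's illustration only, not part of the patch: it assumes the usual SPSS compression bias of 100 and 8-byte OBS blocks, and the method and parameter names (decodeControlByte, nextRawBlock) are hypothetical.

    // Control-byte semantics, as handled by the switch above:
    //   0        padding flag, nothing to emit
    //   1..251   compressed numeric datum; value = code - 100 (the compression bias)
    //   252      end of data
    //   253      datum is stored uncompressed in the next raw 8-byte block
    //   254      string "missing" value, i.e. eight spaces
    //   255      numeric system-missing value (sysmis)
    static Object decodeControlByte(int code, java.util.function.Supplier<byte[]> nextRawBlock) {
        final int BIAS = 100;                            // standard SPSS compression bias
        if (code == 0)   { return null; }                // padding only, skip
        if (code == 252) { return "EOF"; }               // end-of-data marker
        if (code == 253) { return nextRawBlock.get(); }  // caller decodes the raw block as double or string
        if (code == 254) { return "        "; }          // eight spaces (string missing value)
        if (code == 255) { return Double.NaN; }          // placeholder; the reader above uses its own missing-value token
        if (code > 0 && code < 252) { return Double.valueOf(code - BIAS); }
        throw new IllegalArgumentException("out-of-range control byte: " + code);
    }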
+ + dbgLog.fine("SAV Reader: compressed: OBS counter=" + varCounter + "(ii=" + ii + ")"); + + if ((ii * OBS + i + 1) % nOBS == 0) { + + //out.println("casewiseRecordForTabFile(before)="+casewiseRecordForTabFile); + + // out.println("all variables in a case are parsed == nOBS"); + // out.println("hasStringVarContinuousBlock="+hasStringVarContinuousBlock); + + // check whether a string-variable's continuous block exits + // if so, they must be joined + + if (hasStringVarContinuousBlock) { + + // string-variable's continuous-block-concatenating-processing + + //out.println("concatenating process starts"); + //out.println("casewiseRecordForTabFile(before)="+casewiseRecordForTabFile); + //out.println("casewiseRecordForTabFile(before:size)="+casewiseRecordForTabFile.size()); + + StringBuilder sb = new StringBuilder(""); + int firstPosition = 0; + + Set removeJset = new HashSet(); + for (int j = 0; j < nOBS; j++) { + dbgLog.fine("RTD: j=" + j + "-th type =" + OBSwiseTypelList.get(j)); + if ((OBSwiseTypelList.get(j) == -1) || + (OBSwiseTypelList.get(j) == -2)) { + // Continued String variable found at j-th + // position. look back the j-1 + firstPosition = j - 1; + int lastJ = j; + String concatenated = null; + + removeJset.add(j); + sb.append(casewiseRecordForTabFile.get(j - 1)); + sb.append(casewiseRecordForTabFile.get(j)); + + for (int jc = 1; ; jc++) { + if ((j + jc == nOBS) + || ((OBSwiseTypelList.get(j + jc) != -1) + && (OBSwiseTypelList.get(j + jc) != -2))) { + + // j is the end unit of this string variable + concatenated = sb.toString(); + sb.setLength(0); + lastJ = j + jc; + break; + } else { + sb.append(casewiseRecordForTabFile.get(j + jc)); + removeJset.add(j + jc); + } + } + casewiseRecordForTabFile.set(j - 1, concatenated); + + //out.println(j-1+"th concatenated="+concatenated); + j = lastJ - 1; + + } // end-of-if: continuous-OBS only + + } // end of loop-j + + //out.println("removeJset="+removeJset); + + // a new list that stores a new case with concatanated string data + List newDataLine = new ArrayList(); + + for (int jl = 0; jl < casewiseRecordForTabFile.size(); jl++) { + //out.println("jl="+jl+"-th datum =["+casewiseRecordForTabFile.get(jl)+"]"); + + if (!removeJset.contains(jl)) { + +// if (casewiseRecordForTabFile.get(jl).equals(MissingValueForTextDataFileString)){ +// out.println("NA-S jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); +// } else if (casewiseRecordForTabFile.get(jl).equals(MissingValueForTextDataFileNumeric)){ +// out.println("NA-N jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); +// } else if (casewiseRecordForTabFile.get(jl)==null){ +// out.println("null case jl="+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); +// } else if (casewiseRecordForTabFile.get(jl).equals("NaN")){ +// out.println("NaN jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); +// } else if (casewiseRecordForTabFile.get(jl).equals("")){ +// out.println("blank jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); +// } else if (casewiseRecordForTabFile.get(jl).equals(" ")){ +// out.println("space jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); +// } + + newDataLine.add(casewiseRecordForTabFile.get(jl)); + } else { +// out.println("Excluded: jl="+jl+"-th datum=["+casewiseRecordForTabFile.get(jl)+"]"); + } + } // end of loop-jl + + //out.println("new casewiseRecordForTabFile="+newDataLine); + //out.println("new casewiseRecordForTabFile(size)="+newDataLine.size()); + + casewiseRecordForTabFile = newDataLine; + + } // end-if: stringContinuousVar-exist case + + // caseIndex starts from 
1 not 0 + caseIndex = (ii * OBS + i + 1) / nOBS; + + for (int k = 0; k < casewiseRecordForTabFile.size(); k++) { + + dbgLog.fine("k=" + k + "-th variableTypelList=" + variableTypelList.get(k)); + + if (variableTypelList.get(k) > 0) { + + // Strip the String variables off the + // whitespace padding: + + // [ snipped ] + + // I've removed the block of code above where + // String values were substring()-ed to the + // length specified in the variable metadata; + // Doing that was not enough, since a string + // can still be space-padded inside its + // advertised capacity. (note that extended + // variables can have many kylobytes of such + // padding in them!) Plus it was completely + // redundant, since we are stripping all the + // trailing white spaces with + // StringUtils.stripEnd() below: + + + String paddRemoved = StringUtils.stripEnd(casewiseRecordForTabFile.get(k).toString(), null); + // TODO: clean this up. For now, just make sure that strings contain at least one blank space. + if (paddRemoved.equals("")) { + paddRemoved = " "; + } + casewiseRecordForTabFile.set(k, "\"" + paddRemoved.replaceAll("\"", Matcher.quoteReplacement("\\\"")) + "\""); + + // end of String var case + + } // end of variable-type check + + if (casewiseRecordForTabFile.get(k) != null && !casewiseRecordForTabFile.get(k).equals(MissingValueForTextDataFileNumeric)) { + + String variableFormatType = variableFormatTypeList[k]; + dbgLog.finer("k=" + k + "th printFormatTable format=" + printFormatTable.get(variableNameList.get(k))); + + int formatDecimalPointPosition = formatDecimalPointPositionList.get(k); + + + if (variableFormatType.equals("date")) { + dbgLog.finer("date case"); + + long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_OFFSET; + + String newDatum = sdf_ymd.format(new Date(dateDatum)); + dbgLog.finer("k=" + k + ":" + newDatum); + /* saving date format */ + dbgLog.finer("setting caseWiseDateFormatForUNF[k] = " + sdf_ymd.toPattern()); + casewiseRecordForTabFile.set(k, newDatum); + //formatCategoryTable.put(variableNameList.get(k), "date"); + } else if (variableFormatType.equals("time")) { + dbgLog.finer("time case:DTIME or DATETIME or TIME"); + //formatCategoryTable.put(variableNameList.get(k), "time"); + + if (printFormatTable.get(variableNameList.get(k)).equals("DTIME")) { + + if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { + long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_BIAS; + String newDatum = sdf_dhms.format(new Date(dateDatum)); + dbgLog.finer("k=" + k + ":" + newDatum); + casewiseRecordForTabFile.set(k, newDatum); + } else { + // decimal point included + String[] timeData = casewiseRecordForTabFile.get(k).toString().split("\\."); + + dbgLog.finer(StringUtils.join(timeData, "|")); + long dateDatum = Long.parseLong(timeData[0]) * 1000L - SPSS_DATE_BIAS; + StringBuilder sb_time = new StringBuilder( + sdf_dhms.format(new Date(dateDatum))); + dbgLog.finer(sb_time.toString()); + + if (formatDecimalPointPosition > 0) { + sb_time.append("." 
+ timeData[1].substring(0, formatDecimalPointPosition)); + } + + dbgLog.finer("k=" + k + ":" + sb_time.toString()); + casewiseRecordForTabFile.set(k, sb_time.toString()); + } + } else if (printFormatTable.get(variableNameList.get(k)).equals("DATETIME")) { + + if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { + long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_OFFSET; + String newDatum = sdf_ymdhms.format(new Date(dateDatum)); + dbgLog.finer("k=" + k + ":" + newDatum); + casewiseRecordForTabFile.set(k, newDatum); + } else { + // decimal point included + String[] timeData = casewiseRecordForTabFile.get(k).toString().split("\\."); + + //dbgLog.finer(StringUtils.join(timeData, "|")); + long dateDatum = Long.parseLong(timeData[0]) * 1000L - SPSS_DATE_OFFSET; + StringBuilder sb_time = new StringBuilder( + sdf_ymdhms.format(new Date(dateDatum))); + //dbgLog.finer(sb_time.toString()); + + if (formatDecimalPointPosition > 0) { + sb_time.append("." + timeData[1].substring(0, formatDecimalPointPosition)); + } + dbgLog.finer("k=" + k + ":" + sb_time.toString()); + casewiseRecordForTabFile.set(k, sb_time.toString()); + } + } else if (printFormatTable.get(variableNameList.get(k)).equals("TIME")) { + if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { + long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L; + String newDatum = sdf_hms.format(new Date(dateDatum)); + dbgLog.finer("k=" + k + ":" + newDatum); + casewiseRecordForTabFile.set(k, newDatum); + } else { + // decimal point included + String[] timeData = casewiseRecordForTabFile.get(k).toString().split("\\."); + + //dbgLog.finer(StringUtils.join(timeData, "|")); + long dateDatum = Long.parseLong(timeData[0]) * 1000L; + StringBuilder sb_time = new StringBuilder( + sdf_hms.format(new Date(dateDatum))); + //dbgLog.finer(sb_time.toString()); + + if (formatDecimalPointPosition > 0) { + sb_time.append("." 
+ timeData[1].substring(0, formatDecimalPointPosition)); + } + dbgLog.finer("k=" + k + ":" + sb_time.toString()); + casewiseRecordForTabFile.set(k, sb_time.toString()); + } + } + + } else if (variableFormatType.equals("other")) { + dbgLog.finer("other non-date/time case:=" + i); + + if (printFormatTable.get(variableNameList.get(k)).equals("WKDAY")) { + // day of week + dbgLog.finer("data k=" + k + ":" + casewiseRecordForTabFile.get(k)); + dbgLog.finer("data k=" + k + ":" + SPSSConstants.WEEKDAY_LIST.get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1)); + String newDatum = SPSSConstants.WEEKDAY_LIST.get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1); + casewiseRecordForTabFile.set(k, newDatum); + dbgLog.finer("wkday:k=" + k + ":" + casewiseRecordForTabFile.get(k)); + } else if (printFormatTable.get(variableNameList.get(k)).equals("MONTH")) { + // month + dbgLog.finer("data k=" + k + ":" + casewiseRecordForTabFile.get(k)); + dbgLog.finer("data k=" + k + ":" + SPSSConstants.MONTH_LIST.get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1)); + String newDatum = SPSSConstants.MONTH_LIST.get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1); + casewiseRecordForTabFile.set(k, newDatum); + dbgLog.finer("month:k=" + k + ":" + casewiseRecordForTabFile.get(k)); + } + } + + + } // end: date-time-datum check + + + } // end: loop-k(2nd: variable-wise-check) + + + // write to tab file + if (casewiseRecordForTabFile.size() > 0) { + pwout.println(StringUtils.join(casewiseRecordForTabFile, "\t")); + } + + // numeric contents-check + for (int l = 0; l < casewiseRecordForTabFile.size(); l++) { + if (variableFormatTypeList[l].equals("date") + || variableFormatTypeList[l].equals("time") + || printFormatTable.get(variableNameList.get(l)).equals("WKDAY") + || printFormatTable.get(variableNameList.get(l)).equals("MONTH")) { + + } else { + if (variableTypelList.get(l) <= 0) { + if (casewiseRecordForTabFile.get(l).toString().indexOf(".") >= 0) { + decimalVariableSet.add(l); + } + } + } + } + + // reset the case-wise working objects + casewiseRecordForTabFile.clear(); + + if ( caseQnty > 0 ) { + if ( caseIndex == caseQnty ) { + hasReachedEOF = true; + } + } + + if (hasReachedEOF){ + break; + } + + } // if(The-end-of-a-case(row)-processing) + + } // loop-i (OBS unit) + + if ((hasReachedEOF) || (stream.available() == 0)) { + // reached the end of this file + // do exit-processing + + dbgLog.fine("***** reached the end of the file at " + ii + "th iteration *****"); + + break OBSERVATION; + } + + ii++; + + } // while loop + + pwout.close(); + } catch (IOException ex) { + throw ex; + } + + + dbgLog.info("<<<<<<"); + dbgLog.info("formatCategoryTable = " + formatCategoryTable); + dbgLog.info(">>>>>>"); + + + dbgLog.fine("decimalVariableSet=" + decimalVariableSet); + + dbgLog.fine("decodeRecordTypeDataCompressed(): end"); + } + + + void decodeRecordTypeDataUnCompressed(BufferedInputStream stream) throws IOException { + dbgLog.fine("***** decodeRecordTypeDataUnCompressed(): start *****"); + + if (stream ==null){ + throw new IllegalArgumentException("decodeRecordTypeDataUnCompressed: stream == null!"); + } + + int varQnty = dataTable.getVarQuantity().intValue(); + + + // + // set-up tab file + + PrintWriter pwout = createOutputWriter ( stream ); + + boolean hasStringVarContinuousBlock = + obsNonVariableBlockSet.size() > 0 ? 
true : false; + dbgLog.fine("hasStringVarContinuousBlock="+hasStringVarContinuousBlock); + + int ii = 0; + + int OBS = LENGTH_SAV_OBS_BLOCK; + int nOBS = OBSUnitsPerCase; + + dbgLog.fine("OBSUnitsPerCase="+OBSUnitsPerCase); + + int caseIndex = 0; + + dbgLog.fine("printFormatTable:\n"+printFormatTable); + + variableFormatTypeList = new String[varQnty]; + + for (int i = 0; i < varQnty; i++){ + variableFormatTypeList[i]=SPSSConstants.FORMAT_CATEGORY_TABLE.get( + printFormatTable.get(variableNameList.get(i))); + dbgLog.fine("i="+i+"th variableFormatTypeList="+variableFormatTypeList[i]); + formatCategoryTable.put(variableNameList.get(i), variableFormatTypeList[i]); + } + dbgLog.fine("variableFormatType:\n"+Arrays.deepToString(variableFormatTypeList)); + dbgLog.fine("formatCategoryTable:\n"+formatCategoryTable); + + int numberOfDecimalVariables = 0; + + // TODO: + // Make sure the date formats are actually preserved! + // (this is something that was collected in the code below and passed + // to the UNF calculator). + // -- L.A. 4.0 alpha + + List casewiseRecordForTabFile = new ArrayList(); + + + // missing values are written to the tab-delimited file by + // using the default or user-specified missing-value strings; + // however, to calculate UNF/summary statistics, + // classes for these calculations require their specific + // missing values that differ from the above missing-value + // strings; therefore, after row data for the tab-delimited + // file are written, missing values in a row are changed to + // UNF/summary-statistics-OK ones. + + // data-storage object for sumStat + ///dataTable2 = new Object[varQnty][caseQnty]; + // storage of date formats to pass to UNF + ///dateFormats = new String[varQnty][caseQnty]; + + try { + for (int i = 0; ; i++){ // case-wise loop + + byte[] buffer = new byte[OBS*nOBS]; + + int nbytesuc = stream.read(buffer); + + StringBuilder sb_stringStorage = new StringBuilder(""); + + for (int k=0; k < nOBS; k++){ + int offset= OBS*k; + + // uncompressed case + // numeric missing value == sysmis + // FF FF FF FF FF FF eF FF(little endian) + // string missing value + // 20 20 20 20 20 20 20 20 + // cf: compressed case + // numeric type:sysmis == 0xFF + // string type: missing value == 0xFE + // + + boolean isNumeric = OBSwiseTypelList.get(k)==0 ? 
true : false; + + if (isNumeric){ + dbgLog.finer(k+"-th variable is numeric"); + // interprete as double + ByteBuffer bb_double = ByteBuffer.wrap( + buffer, offset , LENGTH_SAV_OBS_BLOCK); + if (isLittleEndian){ + bb_double.order(ByteOrder.LITTLE_ENDIAN); + } + //char[] hexpattern = + String dphex = new String(Hex.encodeHex( + Arrays.copyOfRange(bb_double.array(), + offset, offset+LENGTH_SAV_OBS_BLOCK))); + dbgLog.finer("dphex="+ dphex); + + if ((dphex.equals("ffffffffffffefff"))|| + (dphex.equals("ffefffffffffffff"))){ + //casewiseRecordForTabFile.add(systemMissingValue); + // add the numeric missing value + dbgLog.fine("SAV Reader: adding: Missing Value (numeric)"); + casewiseRecordForTabFile.add(MissingValueForTextDataFileNumeric); + } else { + Double ddatum = bb_double.getDouble(); + dbgLog.fine("SAV Reader: adding: ddatum="+ddatum); + + // add this non-missing-value numeric datum + casewiseRecordForTabFile.add(doubleNumberFormatter.format(ddatum)) ; + } + + } else { + dbgLog.finer(k+"-th variable is string"); + // string case + // strip space-padding + // do not trim: string might have spaces within it + // the missing value (hex) for a string variable is: + // "20 20 20 20 20 20 20 20" + + + String strdatum = new String( + Arrays.copyOfRange(buffer, + offset, (offset+LENGTH_SAV_OBS_BLOCK)),defaultCharSet); + dbgLog.finer("str_datum="+strdatum); + // add this non-missing-value string datum + casewiseRecordForTabFile.add(strdatum); + + } // if isNumeric + + } // k-loop + + // String-variable's continuous block exits: + if (hasStringVarContinuousBlock){ + // continuous blocks: string case + // concatenating process + //dbgLog.fine("concatenating process starts"); + + //dbgLog.fine("casewiseRecordForTabFile(before)="+casewiseRecordForTabFile); + //dbgLog.fine("casewiseRecordForTabFile(before:size)="+casewiseRecordForTabFile.size()); + + StringBuilder sb = new StringBuilder(""); + int firstPosition = 0; + + Set removeJset = new HashSet(); + for (int j=0; j< nOBS; j++){ + dbgLog.finer("j="+j+"-th type ="+OBSwiseTypelList.get(j)); + if (OBSwiseTypelList.get(j) == -1){ + // String continued fount at j-th + // look back the j-1 + firstPosition = j-1; + int lastJ = j; + String concatanated = null; + + removeJset.add(j); + sb.append(casewiseRecordForTabFile.get(j-1)); + sb.append(casewiseRecordForTabFile.get(j)); + for (int jc =1; ; jc++ ){ + if (OBSwiseTypelList.get(j+jc) != -1){ + // j is the end unit of this string variable + concatanated = sb.toString(); + sb.setLength(0); + lastJ = j+jc; + break; + } else { + sb.append(casewiseRecordForTabFile.get(j+jc)); + removeJset.add(j+jc); + } + } + casewiseRecordForTabFile.set(j-1, concatanated); + + //out.println(j-1+"th concatanated="+concatanated); + j = lastJ -1; + + } // end-of-if: continuous-OBS only + } // end of loop-j + + List newDataLine = new ArrayList(); + + for (int jl=0; jl 0){ + sb_time.append("."+timeData[1].substring(0,formatDecimalPointPosition)); + } + + + dbgLog.finer("k="+k+":"+sb_time.toString()); + casewiseRecordForTabFile.set(k, sb_time.toString()); + } + } else if (printFormatTable.get(variableNameList.get(k)).equals("DATETIME")){ + + if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0){ + long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString())*1000L - SPSS_DATE_OFFSET; + String newDatum = sdf_ymdhms.format(new Date(dateDatum)); + dbgLog.finer("k="+k+":"+newDatum); + casewiseRecordForTabFile.set(k, newDatum); + } else { + // decimal point included + String[] timeData = 
casewiseRecordForTabFile.get(k).toString().split("\\."); + + //dbgLog.finer(StringUtils.join(timeData, "|")); + long dateDatum = Long.parseLong(timeData[0])*1000L- SPSS_DATE_OFFSET; + StringBuilder sb_time = new StringBuilder( + sdf_ymdhms.format(new Date(dateDatum))); + //dbgLog.finer(sb_time.toString()); + + if (formatDecimalPointPosition > 0){ + sb_time.append("."+timeData[1].substring(0,formatDecimalPointPosition)); + } + dbgLog.finer("k="+k+":"+sb_time.toString()); + casewiseRecordForTabFile.set(k, sb_time.toString()); + } + } else if (printFormatTable.get(variableNameList.get(k)).equals("TIME")){ + if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0){ + long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString())*1000L; + String newDatum = sdf_hms.format(new Date(dateDatum)); + dbgLog.finer("k="+k+":"+newDatum); + casewiseRecordForTabFile.set(k, newDatum); + } else { + // decimal point included + String[] timeData = casewiseRecordForTabFile.get(k).toString().split("\\."); + + //dbgLog.finer(StringUtils.join(timeData, "|")); + long dateDatum = Long.parseLong(timeData[0])*1000L; + StringBuilder sb_time = new StringBuilder( + sdf_hms.format(new Date(dateDatum))); + //dbgLog.finer(sb_time.toString()); + + if (formatDecimalPointPosition > 0){ + sb_time.append("."+timeData[1].substring(0,formatDecimalPointPosition)); + } + dbgLog.finer("k="+k+":"+sb_time.toString()); + casewiseRecordForTabFile.set(k, sb_time.toString()); + } + } + } else if (variableFormatType.equals("other")){ + dbgLog.finer("other non-date/time case"); + + if (printFormatTable.get(variableNameList.get(k)).equals("WKDAY")){ + // day of week + dbgLog.finer("data k="+k+":"+casewiseRecordForTabFile.get(k)); + dbgLog.finer("data k="+k+":"+SPSSConstants.WEEKDAY_LIST.get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString())-1)); + String newDatum = SPSSConstants.WEEKDAY_LIST.get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString())-1); + casewiseRecordForTabFile.set(k, newDatum); + dbgLog.finer("wkday:k="+k+":"+casewiseRecordForTabFile.get(k)); + } else if (printFormatTable.get(variableNameList.get(k)).equals("MONTH")){ + // month + dbgLog.finer("data k="+k+":"+casewiseRecordForTabFile.get(k)); + dbgLog.finer("data k="+k+":"+SPSSConstants.MONTH_LIST.get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString())-1)); + String newDatum = SPSSConstants.MONTH_LIST.get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString())-1); + casewiseRecordForTabFile.set(k, newDatum); + dbgLog.finer("month:k="+k+":"+casewiseRecordForTabFile.get(k)); + + } + } + // end of date/time block + } // end: date-time-datum check + + } // end: loop-k(2nd: variablte-wise-check) + + // write to tab file + if (casewiseRecordForTabFile.size() > 0) { + pwout.println(StringUtils.join(casewiseRecordForTabFile, "\t")); + } + + // numeric contents-check + for (int l = 0; l < casewiseRecordForTabFile.size(); l++){ + if ( variableFormatTypeList[l].equals("date") || + variableFormatTypeList[l].equals("time") || + printFormatTable.get(variableNameList.get(l)).equals("WKDAY") || + printFormatTable.get(variableNameList.get(l)).equals("MONTH") ) { + + } else { + if (variableTypelList.get(l) <= 0) { + if (casewiseRecordForTabFile.get(l).toString().indexOf(".") >= 0){ + decimalVariableSet.add(l); + } + } + } + } + + // reset the case-wise working objects + casewiseRecordForTabFile.clear(); + + if (stream.available() == 0){ + // reached the end of this file + // do exit-processing + + dbgLog.fine("reached the end of the file at 
"+ii + +"th iteration"); + + break; + } // if eof processing + } //i-loop: case(row) iteration + + // close the writer + pwout.close(); + + + } catch (IOException ex) { + throw ex; + } + + // contents check + dbgLog.fine("numberOfDecimalVariables="+numberOfDecimalVariables); + dbgLog.fine("decimalVariableSet="+decimalVariableSet); + + dbgLog.fine("***** decodeRecordTypeDataUnCompressed(): end *****"); + } + + // Utility Methods -----------------------------------------------------// + + private boolean variableNameIsAnIncrement (String varNameBase, String variableName){ + if ( varNameBase == null ) { + return false; + } + + if ( varNameBase.concat("0").equals(variableName) ) { + return true; + } + + return false; + } + + private boolean variableNameIsAnIncrement (String varNameBase, String lastExtendedVariable, String currentVariable) { + + if ( varNameBase == null || + lastExtendedVariable == null || + currentVariable == null ) { + return false; + } + + if ( varNameBase.length() >= lastExtendedVariable.length() ) { + return false; + } + + if ( varNameBase.length() >= currentVariable.length() ) { + return false; + } + + if ( !(varNameBase.equals(currentVariable.substring(0,varNameBase.length()))) ) { + return false; + } + + String lastSuffix = lastExtendedVariable.substring(varNameBase.length()); + String currentSuffix = currentVariable.substring(varNameBase.length()); + + if ( currentSuffix.length() > 2 ) { + return false; + } + + //if ( !currentSuffix.matches("^[0-9A-Z]*$") ) { + // return false; + //} + + return suffixIsAnIncrement (lastSuffix, currentSuffix); + } + + + private boolean suffixIsAnIncrement ( String lastSuffix, String currentSuffix ) { + // Extended variable suffixes are base-36 number strings in the + // [0-9A-Z] alphabet. I.e. the incremental suffixes go from + // 0 to 9 to A to Z to 10 to 1Z ... etc. + + int lastSuffixValue = intBase36 ( lastSuffix ); + int currentSuffixValue = intBase36 ( currentSuffix ); + + if ( currentSuffixValue - lastSuffixValue > 0 ) { + return true; + } + + return false; + } + + private int intBase36 ( String stringBase36 ) { + + // integer value of a base-36 string in [0-9A-Z] alphabet; + // i.e. "0" = 0, "9" = 9, "A" = 10, + // "Z" = 35, "10" = 36, "1Z" = 71 ... 
+ + byte[] stringBytes = stringBase36.getBytes(); + + int ret = 0; + + for ( int i = 0; i < stringBytes.length; i++ ) { + int value = 0; + if (stringBytes[i] >= 48 && stringBytes[i] <= 57 ) { + // [0-9] + value = (int)stringBytes[i] - 48; + } else if (stringBytes[i] >= 65 && stringBytes[i] <= 90 ) { + // [A-Z] + value = (int)stringBytes[i] - 55; + } + + ret = (ret * 36) + value; + } + + return ret; + } + + + private int getSAVintAdjustedBlockLength(int rawLength){ + int adjustedLength = rawLength; + if ((rawLength%LENGTH_SAV_INT_BLOCK ) != 0){ + adjustedLength = + LENGTH_SAV_INT_BLOCK*(rawLength/LENGTH_SAV_INT_BLOCK +1) ; + } + return adjustedLength; + } + + private int getSAVobsAdjustedBlockLength(int rawLength){ + int adjustedLength = rawLength; + if ((rawLength%LENGTH_SAV_OBS_BLOCK ) != 0){ + adjustedLength = + LENGTH_SAV_OBS_BLOCK*(rawLength/LENGTH_SAV_OBS_BLOCK +1) ; + } + return adjustedLength; + } + + + private int[] parseRT7SubTypefieldHeader(BufferedInputStream stream) throws IOException { + int length_unit_length = 4; + int length_number_of_units = 4; + int storage_size = length_unit_length + length_number_of_units; + + int[] headerSection = new int[2]; + + byte[] byteStorage = new byte[storage_size]; + + try { + int nbytes = stream.read(byteStorage); + // to-do check against nbytes + + //printHexDump(byteStorage, "RT7:storage"); + + ByteBuffer bb_data_type = ByteBuffer.wrap(byteStorage, + 0, length_unit_length); + if (isLittleEndian){ + bb_data_type.order(ByteOrder.LITTLE_ENDIAN); + } + + int unitLength = bb_data_type.getInt(); + dbgLog.fine("parseRT7 SubTypefield: unitLength="+unitLength); + + ByteBuffer bb_number_of_units = ByteBuffer.wrap(byteStorage, + length_unit_length, length_number_of_units); + if (isLittleEndian){ + bb_number_of_units.order(ByteOrder.LITTLE_ENDIAN); + } + + int numberOfUnits = bb_number_of_units.getInt(); + dbgLog.fine("parseRT7 SubTypefield: numberOfUnits="+numberOfUnits); + + headerSection[0] = unitLength; + headerSection[1] = numberOfUnits; + return headerSection; + } catch (IOException ex) { + throw ex; + } + } + + private void parseRT7SubTypefield(BufferedInputStream stream) throws IOException { + int length_unit_length = 4; + int length_number_of_units = 4; + int storage_size = length_unit_length + length_number_of_units; + + int[] headerSection = new int[2]; + + byte[] byteStorage = new byte[storage_size]; + + try{ + int nbytes = stream.read(byteStorage); + // to-do check against nbytes + + //printHexDump(byteStorage, "RT7:storage"); + + ByteBuffer bb_data_type = ByteBuffer.wrap(byteStorage, + 0, length_unit_length); + if (isLittleEndian){ + bb_data_type.order(ByteOrder.LITTLE_ENDIAN); + } + + int unitLength = bb_data_type.getInt(); + dbgLog.fine("parseRT7 SubTypefield: unitLength="+unitLength); + + ByteBuffer bb_number_of_units = ByteBuffer.wrap(byteStorage, + length_unit_length, length_number_of_units); + if (isLittleEndian){ + bb_number_of_units.order(ByteOrder.LITTLE_ENDIAN); + } + + int numberOfUnits = bb_number_of_units.getInt(); + dbgLog.fine("parseRT7 SubTypefield: numberOfUnits="+numberOfUnits); + + headerSection[0] = unitLength; + headerSection[1] = numberOfUnits; + + for (int i=0; i getRT7SubTypefieldData(BufferedInputStream stream) throws IOException { + int length_unit_length = 4; + int length_number_of_units = 4; + int storage_size = length_unit_length + length_number_of_units; + List dataList = new ArrayList(); + int[] headerSection = new int[2]; + + byte[] byteStorage = new byte[storage_size]; + + try{ + int nbytes = 
stream.read(byteStorage); + // to-do check against nbytes + + //printHexDump(byteStorage, "RT7:storage"); + + ByteBuffer bb_data_type = ByteBuffer.wrap(byteStorage, + 0, length_unit_length); + if (isLittleEndian){ + bb_data_type.order(ByteOrder.LITTLE_ENDIAN); + } + + int unitLength = bb_data_type.getInt(); + dbgLog.fine("parseRT7SubTypefield: unitLength="+unitLength); + + ByteBuffer bb_number_of_units = ByteBuffer.wrap(byteStorage, + length_unit_length, length_number_of_units); + if (isLittleEndian){ + bb_number_of_units.order(ByteOrder.LITTLE_ENDIAN); + } + + int numberOfUnits = bb_number_of_units.getInt(); + dbgLog.fine("parseRT7SubTypefield: numberOfUnits="+numberOfUnits); + + headerSection[0] = unitLength; + headerSection[1] = numberOfUnits; + + for (int i=0; iInteger value to + * SPSS SAV data-format code. + */ + public static Map FORMAT_CODE_TABLE_SAV = + new LinkedHashMap(); + + /** + * A mapping table from a Integer value to + * SPSS POR data-format code. + *

Note: after 17, SPSS POR and SAV formats no longer + * coincide. + */ + public static Map FORMAT_CODE_TABLE_POR = + new LinkedHashMap(); + + /** + * A mapping table that groups data-formats into three categories + * (date, time, and other). + */ + public static Map FORMAT_CATEGORY_TABLE = + new LinkedHashMap(); + + private static List ORDINARY_FORMAT_CODE = Arrays.asList(0, 1, 5); + + /** + * A Set instance that tells whether a given format code + * is not a date/time type. + */ + public static final Set ORDINARY_FORMAT_CODE_SET = + new LinkedHashSet(ORDINARY_FORMAT_CODE); + + /** + * A String array of short weekday names in English + */ + public static String[] WEEKDAYS = {"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"}; + + /** + * A String array of short month names in English + */ + public static String[] MONTHS = {"Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"}; + + /** + * A mapping table from an Integer to + * a short-weekday name in English. + */ + public static final Map WEEKDAY_LIST= new LinkedHashMap(); + /** + * A mapping table from an Integer to + * a short-month name in English. + */ + public static final Map MONTH_LIST= new LinkedHashMap(); + + static{ + for (int i=0; i< FORMAT_KEYS_SAV.length; i++){ + FORMAT_CODE_TABLE_SAV.put(FORMAT_KEYS_SAV[i], FORMAT_VALUES[i]); + FORMAT_CODE_TABLE_POR.put(FORMAT_KEYS_POR[i], FORMAT_VALUES[i]); + FORMAT_CATEGORY_TABLE.put(FORMAT_VALUES[i], FORMAT_CATEGORIES[i]); + + } + + for (int i=0; i< WEEKDAYS.length;i++){ + WEEKDAY_LIST.put(i, WEEKDAYS[i]); + } + for (int i=0; i< MONTHS.length;i++){ + MONTH_LIST.put(i, MONTHS[i]); + } + + } + + +} diff --git a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java index ab60018a4a3..d686417ac3e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java @@ -128,10 +128,12 @@ public DatasetField parseField( JsonObject json ) throws JsonParseException { try { DatasetField ret = new DatasetField(); DatasetFieldType type = datasetFieldSvc.findByName(json.getString("typeName","")); - + + if ( type == null ) { + throw new NoResultException("Can't find type '" + json.getString("typeName","") +"'"); + } ret.setDatasetFieldType(type); - if ( type.isCompound() ) { List vals = parseCompoundValue(type, json); for ( DatasetFieldCompoundValue dsfcv : vals ) { diff --git a/src/main/webapp/dataset.xhtml b/src/main/webapp/dataset.xhtml index a0c66840f7a..6f0addf7671 100644 --- a/src/main/webapp/dataset.xhtml +++ b/src/main/webapp/dataset.xhtml @@ -55,7 +55,7 @@

@@ -66,13 +66,13 @@
    - - - + + + - + Select version number: - + Due to the nature of changes to the current draft this will be a major release (#{DatasetPage.datasetNextMajorVersion}) - + + + + + +
    @@ -176,30 +204,31 @@
    - #{DatasetPage.datasetVersionUI.title.value} + #{DatasetPage.datasetVersionUI.title.value} +
    - #{DatasetPage.displayCitation} -
    - - In Draft - + #{DatasetPage.displayCitation} + + + + + +
    #{DatasetPage.datasetVersionUI.description.value} - -
    Keyword(s) @@ -286,9 +315,9 @@ + and permissionServiceBean.on(DatasetPage.dataset).canIssueCommand('UpdateDatasetCommand')}">
    - + Upload + Edit Files @@ -301,7 +330,7 @@ - + @@ -310,7 +339,7 @@ - + @@ -407,9 +436,9 @@ + and permissionServiceBean.on(DatasetPage.dataset).canIssueCommand('UpdateDatasetCommand')}">
    - + Add + Edit Metadata @@ -450,7 +479,7 @@ - + diff --git a/src/main/webapp/datasetFieldForEditFragment.xhtml b/src/main/webapp/datasetFieldForEditFragment.xhtml index 85cc1cf1e6b..1fc4d97ecda 100644 --- a/src/main/webapp/datasetFieldForEditFragment.xhtml +++ b/src/main/webapp/datasetFieldForEditFragment.xhtml @@ -6,33 +6,45 @@ xmlns:p="http://primefaces.org/ui"> - - - - - - + + + + + + + + +
    + #{dsfv.datasetField.validationMessage} +
    + #{dsfv.validationMessage} +
    \ No newline at end of file diff --git a/src/main/webapp/dataverse.xhtml b/src/main/webapp/dataverse.xhtml index 985dfd58b6e..d475b6bccd9 100644 --- a/src/main/webapp/dataverse.xhtml +++ b/src/main/webapp/dataverse.xhtml @@ -24,9 +24,9 @@ setCount(this, elem); }); function setCount(src, elem) { - if (typeof src === "undefined") { - return; - } + if (typeof src === "undefined") { + return; + } var chars = src.value.length; elem.html(limit - chars); } @@ -66,6 +66,7 @@ + @@ -80,10 +81,14 @@ - + + + + - + itemDisabled="#{dv eq DataversePage.dataverse or dv.owners.contains(DataversePage.dataverse)}" /> + + @@ -97,7 +102,7 @@
    @@ -298,7 +303,7 @@ - + @@ -358,7 +363,6 @@ - diff --git a/src/main/webapp/dataverse_header.xhtml b/src/main/webapp/dataverse_header.xhtml index fdc31d1ec6c..1262c913c3c 100644 --- a/src/main/webapp/dataverse_header.xhtml +++ b/src/main/webapp/dataverse_header.xhtml @@ -93,13 +93,12 @@