+ * <code>String</code> that represents the numeric missing value
+ * in the final tab-delimited data file.
+ */
+ private String MissingValueForTextDataFileNumeric = "";
+
+
+ public String getMissingValueForTextDataFileNumeric() {
+ return MissingValueForTextDataFileNumeric;
+ }
+
+
+ public void setMissingValueForTextDataFileNumeric(String MissingValueToken) {
+ this.MissingValueForTextDataFileNumeric = MissingValueToken;
+ }
+
+
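+ /**
+  * <code>String</code> that represents the character-type missing value
+  * in the final tab-delimited data file (the string counterpart of the
+  * numeric token above).
+  */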
+ String MissingValueForTextDataFileString = "";
+
+
+ public String getMissingValueForTextDataFileString() {
+ return MissingValueForTextDataFileString;
+ }
+
+
+ public void setMissingValueForTextDataFileString(String MissingValueToken) {
+ this.MissingValueForTextDataFileString = MissingValueToken;
+ }
+
+
+ public SAVFileReader(TabularDataFileReaderSpi originator){
+ super(originator);
+ }
+
+ // Methods ---------------------------------------------------------------//
+
+ private void init() throws IOException {
+
+ Context ctx = null;
+ try {
+ ctx = new InitialContext();
+ varService = (VariableServiceBean) ctx.lookup("java:global/dataverse-4.0/VariableServiceBean");
+ } catch (NamingException nex) {
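+ // the lookup above assumes the application is deployed under the name
+ // "dataverse-4.0"; fall back on the plain "dataverse" JNDI name otherwise: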
+ try {
+ ctx = new InitialContext();
+ varService = (VariableServiceBean) ctx.lookup("java:global/dataverse/VariableServiceBean");
+ } catch (NamingException nex2) {
+ if (dbgLog.isLoggable(Level.INFO)) dbgLog.info("Could not look up initial context, or the variable service in JNDI!");
+ throw new IOException ("Could not look up initial context, or the variable service in JNDI!");
+ }
+ }
+
+ sdf_ymd.setTimeZone(TimeZone.getTimeZone("GMT"));
+ sdf_ymdhms.setTimeZone(TimeZone.getTimeZone("GMT"));
+ sdf_dhms.setTimeZone(TimeZone.getTimeZone("GMT"));
+ sdf_hms.setTimeZone(TimeZone.getTimeZone("GMT"));
+
+ doubleNumberFormatter.setGroupingUsed(false);
+ doubleNumberFormatter.setMaximumFractionDigits(340);
+
+ if (getDataLanguageEncoding() != null) {
+ defaultCharSet = getDataLanguageEncoding();
+ }
+ }
+
+ public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException{
+ dbgLog.info("SAVFileReader: read() start");
+
+ if (dataFile != null) {
+ throw new IOException ("this plugin does not support external raw data files");
+ }
+
+ /* ingest happens here ... */
+
+ // the following methods are now executed, in this order:
+
+ // decodeHeader -- this method doesn't read any [meta]data and
+ // doesn't initialize any values; its only purpose is to
+ // make sure that the file is indeed an SPSS/SAV file.
+ //
+ // decodeRecordType1 -- there's always one RT1 record; it is
+ // always 176 bytes long. it contains the very basic metadata
+ // about the data file. most notably, the number of observations
+ // and the number of OBS (8 byte values) per observation.
+ //
+ // decodeRecordType2 -- there are multiple RT2 records. there's
+ // one RT2 for every OBS (8 byte value); i.e. one per variable,
+ // or more per every String variable split into multiple OBS
+ // segments. this one is a 400-line method that may benefit
+ // from being split into smaller methods.
+ //
+ // decodeRecordType3and4 -- these sections come in pairs, each
+ // pair dedicated to one set of variable labels.
+ // decodeRecordType6 -- the (optional) document record, i.e. free-form
+ // text notes about the data file.
+ //
+ // decodeRecordType7 -- this RT contains some extended
+ // metadata for the data file. (including the information
+ // about the extended variables, i.e. variables longer than
+ // 255 bytes split into 255 byte fragments that are stored
+ // in the data file as independent variables).
+ //
+ // decodeRecordType999 -- this RT does not contain any data;
+ // its sole function is to indicate that the metadata portion
+ // of the data file is over and the data section follows.
+ //
+ // decodeRecordTypeData -- this method decodes the data section
+ // of the file. Inside this method, 2 distinct methods are
+ // called to process compressed or uncompressed data, depending
+ // on which method is used in this data file.
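+ //
+ // a rough sketch of the overall .sav layout, as implied by the order above:
+ //
+ // [$FL2][RT1: header][RT2 x OBS count][RT3/RT4 pairs][RT6][RT7 subtypes][RT999][data section]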
+
+
+ String methodCurrentlyExecuted = null;
+
+ try {
+ methodCurrentlyExecuted = "decodeHeader";
+ dbgLog.fine("***** SAVFileReader: executing method decodeHeader");
+ decodeHeader(stream);
+
+ methodCurrentlyExecuted = "decodeRecordType1";
+ dbgLog.fine("***** SAVFileReader: executing method decodeRecordType1");
+ decodeRecordType1(stream);
+
+ methodCurrentlyExecuted = "decodeRecordType2";
+ dbgLog.fine("***** SAVFileReader: executing method decodeRecordType2");
+ decodeRecordType2(stream);
+
+ methodCurrentlyExecuted = "decodeRecordType3and4";
+ dbgLog.fine("***** SAVFileReader: executing method decodeRecordType3and4");
+ decodeRecordType3and4(stream);
+
+ methodCurrentlyExecuted = "decodeRecordType6";
+ dbgLog.fine("***** SAVFileReader: executing method decodeRecordType6");
+ decodeRecordType6(stream);
+
+ methodCurrentlyExecuted = "decodeRecordType7";
+ dbgLog.fine("***** SAVFileReader: executing method decodeRecordType7");
+ decodeRecordType7(stream);
+
+ methodCurrentlyExecuted = "decodeRecordType999";
+ dbgLog.fine("***** SAVFileReader: executing method decodeRecordType999");
+ decodeRecordType999(stream);
+
+ methodCurrentlyExecuted = "decodeRecordTypeData";
+ dbgLog.fine("***** SAVFileReader: executing method decodeRecordTypeData");
+ decodeRecordTypeData(stream);
+
+
+ } catch (IllegalArgumentException e) {
+ //Throwable cause = e.getCause();
+ dbgLog.fine("***** SAVFileReader: ATTENTION: IllegalArgumentException thrown while executing "+methodCurrentlyExecuted);
+ e.printStackTrace();
+ throw new IllegalArgumentException ( "in method "+methodCurrentlyExecuted+": "+e.getMessage() );
+ } catch (IOException e) {
+ dbgLog.fine("***** SAVFileReader: ATTENTION: IOException thrown while executing "+methodCurrentlyExecuted);
+ e.printStackTrace();
+ throw new IOException ( "in method "+methodCurrentlyExecuted+": "+e.getMessage() );
+ }
+
+ /*
+ * Final variable type assignments;
+ * TODO: (maybe?)
+ * Instead of doing it here, perhaps all the type assignments need to
+ * be done on DataVariable objects directly; without relying on
+ * maps and lists here... -- L.A. 4.0 beta (?)
+ */
+
+ for (int indx = 0; indx < variableTypelList.size(); indx++) {
+ String varName = dataTable.getDataVariables().get(indx).getName();
+ int simpleType = 0;
+ if (variableTypelList.get(indx) != null) {
+ simpleType = variableTypelList.get(indx).intValue();
+ }
+
+ if (simpleType <= 0) {
+ // We need to make one last type adjustment:
+ // Dates and Times will be stored as character values in the
+ // dataverse tab files; even though they are not typed as
+ // strings at this point:
+ // TODO:
+ // Make sure the date/time format is properly preserved!
+ // (see the setFormatCategory below... but double-check!)
+ // -- L.A. 4.0 alpha
+ String variableFormatType = variableFormatTypeList[indx];
+ if (variableFormatType != null
+ && (variableFormatType.equals("time")
+ || variableFormatType.equals("date"))) {
+ ///variableTypeMinimal[indx] = 1;
+ simpleType = 1;
+
+ String formatCategory = formatCategoryTable.get(varName);
+
+ if (formatCategory != null) {
+ dataTable.getDataVariables().get(indx).setFormatCategory(formatCategory);
+ }
+ }
+ }
+
+ // OK, we can now assign the types:
+
+ if (simpleType > 0) {
+ // String:
+ dataTable.getDataVariables().get(indx).setVariableFormatType(varService.findVariableFormatTypeByName("character"));
+ dataTable.getDataVariables().get(indx).setVariableIntervalType(varService.findVariableIntervalTypeByName("discrete"));
+ } else {
+ // Numeric:
+ dataTable.getDataVariables().get(indx).setVariableFormatType(varService.findVariableFormatTypeByName("numeric"));
+ // discrete or continuous?
+ // "decimal variables" become dataverse data variables of interval type "continuous":
+
+ if (decimalVariableSet.contains(indx)) {
+ dataTable.getDataVariables().get(indx).setVariableIntervalType(varService.findVariableIntervalTypeByName("continuous"));
+ } else {
+ dataTable.getDataVariables().get(indx).setVariableIntervalType(varService.findVariableIntervalTypeByName("discrete"));
+ }
+
+ }
+
+ // TODO: take care of the SPSS "shortToLongVariableNameTable"
+ // mapping before returning the ingested data object. -- 4.0 alpha
+ // (done, below - but verify!)
+
+ if (shortToLongVariableNameTable.containsKey(varName)) {
+ String longName = shortToLongVariableNameTable.get(varName);
+ if (longName != null && !longName.equals("")) {
+ dataTable.getDataVariables().get(indx).setName(longName);
+ }
+ }
+
+ }
+
+ ingesteddata.setDataTable(dataTable);
+
+ dbgLog.info("SAVFileReader: read() end");
+ return ingesteddata;
+ }
+
+ void decodeHeader(BufferedInputStream stream) throws IOException {
+ dbgLog.fine("decodeHeader(): start");
+
+ if (stream ==null){
+ throw new IllegalArgumentException("stream == null!");
+ }
+ // the length of the magic number is 4 (1-byte character * 4)
+ // its value is expected to be $FL2
+
+ byte[] b = new byte[SAV_MAGIC_NUMBER_LENGTH];
+
+ try {
+ if (stream.markSupported()){
+ stream.mark(100);
+ }
+ int nbytes = stream.read(b, 0, SAV_MAGIC_NUMBER_LENGTH);
+
+ if (nbytes == 0){
+ throw new IOException();
+ }
+
+ } catch (IOException ex){
+ //ex.printStackTrace();
+ throw ex;
+ }
+
+ //printHexDump(b, "hex dump of the byte-array");
+
+ String hdr4sav = new String(b);
+ dbgLog.fine("from string=" + hdr4sav);
+
+ if (hdr4sav.equals(SAV_FILE_SIGNATURE)) {
+ dbgLog.fine("this file is spss-sav type");
+ // initialize version-specific parameter
+ init();
+
+ dataTable.setOriginalFileFormat(MIME_TYPE[0]);
+
+ dataTable.setUnf("UNF:6:NOTCALCULATED");
+
+
+ } else {
+ dbgLog.fine("this file is NOT spss-sav type");
+
+ throw new IllegalArgumentException("given file is not spss-sav type");
+ }
+
+ // TODO:
+ // Decide what to do with the charset, where should it be stored?
+ // -- 4.0 alpha
+ //4.0//smd.getFileInformation().put("charset", defaultCharSet);
+ dbgLog.fine("***** decodeHeader(): end *****");
+
+ }
+
+
+ void decodeRecordType1(BufferedInputStream stream) throws IOException {
+ dbgLog.fine("***** decodeRecordType1(): start *****");
+
+ if (stream ==null){
+ throw new IllegalArgumentException("stream == null!");
+ }
+ // how to read each recordType
+ // 1. set-up the following objects before reading bytes
+ // a. the working byte array
+ // b. the storage object
+ // the length of this field: 172 bytes = 60 + 4 + 4 + 4 + 4 + 4 + 8 + 84
+ // this field consists of 8 distinct blocks (1.1 through 1.8 below)
+
+ byte[] recordType1 = new byte[LENGTH_RECORDTYPE1];
+ // int caseWeightVariableOBSIndex = 0;
+
+ try {
+ int nbytes = stream.read(recordType1, 0, LENGTH_RECORDTYPE1);
+
+
+ //printHexDump(recordType1, "recordType1");
+
+ if (nbytes == 0){
+ throw new IOException("reading recordType1: no byte was read");
+ }
+
+ // 1.1 60 byte-String that tells the platform/version of SPSS that
+ // wrote this file
+
+ int offset_start = 0;
+ int offset_end = LENGTH_SPSS_PRODUCT_INFO; // 60 bytes
+
+ String productInfo = new String(Arrays.copyOfRange(recordType1, offset_start,
+ offset_end),"US-ASCII");
+
+ dbgLog.fine("productInfo:\n"+productInfo+"\n");
+
+ // try to parse out the SPSS version that created this data
+ // file:
+
+ String spssVersionNumberTag = null;
+
+ String regexpVersionNumber = ".*Release ([0-9]*)";
+ Pattern versionTagPattern = Pattern.compile(regexpVersionNumber);
+ Matcher matcher = versionTagPattern.matcher(productInfo);
+ if ( matcher.find() ) {
+ spssVersionNumberTag = matcher.group(1);
+ dbgLog.fine("SPSS Version Number: "+spssVersionNumberTag);
+ dataTable.setOriginalFormatVersion(spssVersionNumberTag);
+ }
+
+ if (spssVersionNumberTag != null && !spssVersionNumberTag.equals("")) {
+ spssVersionNumber = Integer.valueOf(spssVersionNumberTag).intValue();
+
+
+ /*
+ * Starting with SPSS version 16, the default encoding is
+ * UTF-8.
+ * But we are only going to use it if the user did not explicitly
+ * specify the encoding on the addfiles page. Then we'd want
+ * to stick with whatever they entered.
+ */
+ if (spssVersionNumber > 15) {
+ if (getDataLanguageEncoding() == null) {
+ defaultCharSet = "UTF-8";
+ }
+ }
+ }
+
+ // TODO:
+ // decide what to do with the charset? -- 4.0 alpha
+ //4.0//smd.getFileInformation().put("charset", defaultCharSet);
+
+ // 1.2) 4-byte file-layout-code (byte-order)
+
+ offset_start = offset_end;
+ offset_end += LENGTH_FILE_LAYOUT_CODE; // 4 byte
+
+ ByteBuffer bb_fileLayout_code = ByteBuffer.wrap(
+ recordType1, offset_start, LENGTH_FILE_LAYOUT_CODE);
+
+ ByteBuffer byteOrderTest = bb_fileLayout_code.duplicate();
+ // interpret the 4 bytes as an int
+
+ int int2test = byteOrderTest.getInt();
+
+ if (int2test == 2 || int2test == 3){
+ dbgLog.fine("integer == "+int2test+": the byte-oder of the writer is the same "+
+ "as the counterpart of Java: Big Endian");
+ } else {
+ // Because Java's byte-order is always big endian,
+ // this (!= 2) means this sav file was written on a little-endian machine;
+ // non-string, multi-byte blocks must be byte-reversed
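+ //
+ // (for example, the layout code 2 written on a little-endian machine is
+ // stored as the bytes 02 00 00 00; read big-endian that is 0x02000000 =
+ // 33554432, which is why the first getInt() above did not return 2 or 3)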
+
+ bb_fileLayout_code.order(ByteOrder.LITTLE_ENDIAN);
+
+ int2test = bb_fileLayout_code.getInt();
+
+ if (int2test == 2 || int2test == 3){
+ dbgLog.fine("The sav file was saved on a little endian machine");
+ dbgLog.fine("Reveral of the bytes is necessary to decode "+
+ "multi-byte, non-string blocks");
+
+ isLittleEndian = true;
+
+ } else {
+ throw new IOException("reading recordType1:unknown file layout code="+int2test);
+ }
+ }
+
+ dbgLog.fine("Endian of this platform:"+ByteOrder.nativeOrder().toString());
+
+ // 1.3 4-byte Number_Of_OBS_Units_Per_Case
+ // (= how many RT2 records => how many variables)
+
+ offset_start = offset_end;
+ offset_end += LENGTH_NUMBER_OF_OBS_UNITS_PER_CASE; // 4 byte
+
+ ByteBuffer bb_OBS_units_per_case = ByteBuffer.wrap(
+ recordType1, offset_start,LENGTH_NUMBER_OF_OBS_UNITS_PER_CASE);
+
+ if (isLittleEndian){
+ bb_OBS_units_per_case.order(ByteOrder.LITTLE_ENDIAN);
+ }
+
+
+ OBSUnitsPerCase = bb_OBS_units_per_case.getInt();
+
+ dbgLog.fine("RT1: OBSUnitsPerCase="+OBSUnitsPerCase);
+
+ // 1.4 4-byte Compression_Switch
+
+ offset_start = offset_end;
+ offset_end += LENGTH_COMPRESSION_SWITCH; // 4 byte
+
+ ByteBuffer bb_compression_switch = ByteBuffer.wrap(recordType1,
+ offset_start, LENGTH_COMPRESSION_SWITCH);
+
+ if (isLittleEndian){
+ bb_compression_switch.order(ByteOrder.LITTLE_ENDIAN);
+ }
+
+ int compression_switch = bb_compression_switch.getInt();
+ if ( compression_switch == 0){
+ // data section is not compressed
+ isDataSectionCompressed = false;
+ dbgLog.fine("data section is not compressed");
+ } else {
+ dbgLog.fine("data section is compressed:"+compression_switch);
+ }
+
+ // 1.5 4-byte Case-Weight Variable Index
+ // warning: this variable index starts from 1, not 0
+
+ offset_start = offset_end;
+ offset_end += LENGTH_CASE_WEIGHT_VARIABLE_INDEX; // 4 byte
+
+ ByteBuffer bb_Case_Weight_Variable_Index = ByteBuffer.wrap(recordType1,
+ offset_start, LENGTH_CASE_WEIGHT_VARIABLE_INDEX);
+
+ if (isLittleEndian){
+ bb_Case_Weight_Variable_Index.order(ByteOrder.LITTLE_ENDIAN);
+ }
+
+ caseWeightVariableOBSIndex = bb_Case_Weight_Variable_Index.getInt();
+
+ /// caseWeightVariableOBSIndex will be used later on to locate
+ /// the weight variable; so we'll be able to mark the corresponding
+ /// variables properly.
+ // TODO: make sure case weight variables are properly handled!
+ // -- L.A. 4.0 beta
+ ///smd.getFileInformation().put("caseWeightVariableOBSIndex", caseWeightVariableOBSIndex);
+
+ // 1.6 4-byte Number of Cases
+
+ offset_start = offset_end;
+ offset_end += LENGTH_NUMBER_OF_CASES; // 4 byte
+
+ ByteBuffer bb_Number_Of_Cases = ByteBuffer.wrap(recordType1,
+ offset_start, LENGTH_NUMBER_OF_CASES);
+
+ if (isLittleEndian){
+ bb_Number_Of_Cases.order(ByteOrder.LITTLE_ENDIAN);
+ }
+
+ int numberOfCases = bb_Number_Of_Cases.getInt(); // the case count is a 4-byte int
+
+ if ( numberOfCases < 0){
+ // -1 if numberOfCases is unknown
+ throw new RuntimeException("number of cases is not recorded in the header");
+ } else {
+ dbgLog.fine("RT1: number of cases is recorded= "+numberOfCases);
+ dataTable.setCaseQuantity(new Long(numberOfCases));
+ ///caseQnty = numberOfCases;
+ ///smd.getFileInformation().put("caseQnty", numberOfCases);
+ }
+
+ // 1.7 8-byte compression-bias [not long but double]
+
+ offset_start = offset_end;
+ offset_end += LENGTH_COMPRESSION_BIAS; // 8 byte
+
+ ByteBuffer bb_compression_bias = ByteBuffer.wrap(
+ Arrays.copyOfRange(recordType1, offset_start,
+ offset_end));
+
+ if (isLittleEndian){
+ bb_compression_bias.order(ByteOrder.LITTLE_ENDIAN);
+ }
+
+ Double compressionBias = bb_compression_bias.getDouble();
+
+ // TODO:
+ // check if this "compression bias" is being used anywhere?
+ // doesn't seem to be!
+ // -- 4.0 alpha
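+ // (for reference: in the compressed data section, each 1-byte code c in the
+ // 1..251 range is supposed to represent the number (c - bias), i.e. (c - 100)
+ // with the standard bias -- which is presumably where this value would be used)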
+ if ( compressionBias == 100d){
+ // 100 is expected
+ dbgLog.fine("compressionBias is 100 as expected");
+ ///smd.getFileInformation().put("compressionBias", 100);
+ } else {
+ dbgLog.fine("compression bias is not 100: "+ compressionBias);
+ ///smd.getFileInformation().put("compressionBias", compressionBias);
+ }
+
+
+ // 1.8 84-byte File Creation Information (date/time: "dd MM yy hh:mm:ss" +
+ // 64-byte label)
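+ // (the 84 bytes presumably break down into a 9-byte creation date, an 8-byte
+ // creation time and a 64-byte file label, plus padding -- see the
+ // length_file_creation_* values used below)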
+
+ offset_start = offset_end;
+ offset_end += LENGTH_FILE_CREATION_INFO; // 84 bytes
+
+ String fileCreationInfo = getNullStrippedString(new String(Arrays.copyOfRange(recordType1, offset_start,
+ offset_end),"US-ASCII"));
+
+ dbgLog.fine("fileCreationInfo:\n"+fileCreationInfo+"\n");
+
+ String fileCreationDate = fileCreationInfo.substring(0,length_file_creation_date);
+ int dateEnd = length_file_creation_date+length_file_creation_time;
+ String fileCreationTime = fileCreationInfo.substring(length_file_creation_date,
+ (dateEnd));
+ String fileCreationNote = fileCreationInfo.substring(dateEnd,length_file_creation_label);
+
+
+ dbgLog.fine("fileDate="+ fileCreationDate);
+ dbgLog.fine("fileTime="+ fileCreationTime);
+ dbgLog.fine("fileNote"+ fileCreationNote);
+
+ // 4.0 - my comments from the DTA reader:
+ /* All these time/date stamps - I don't think we are using
+ * them anywhere. -- L.A. 4.0
+ */
+ /* As for the "varformat schema" - storing this information was
+ * largely redundant, since we know that all the variables in
+ * this data table come from an SPSS file. -- L.A. 4.0
+ */
+ ///smd.getFileInformation().put("fileDate", fileCreationDate);
+ ///smd.getFileInformation().put("fileTime", fileCreationTime);
+ ///smd.getFileInformation().put("fileNote", fileCreationNote);
+ ///smd.getFileInformation().put("varFormat_schema", "SPSS");
+
+
+ /// mime type has already been set on the newly created dataTable,
+ /// earlier.
+ //smd.getFileInformation().put("mimeType", MIME_TYPE[0]);
+ //smd.getFileInformation().put("fileFormat", MIME_TYPE[0]);
+
+ ///smd.setValueLabelMappingTable(valueVariableMappingTable);
+
+
+ } catch (IOException ex) {
+ throw ex;
+ }
+
+ dbgLog.fine("decodeRecordType1(): end");
+ }
+
+
+ void decodeRecordType2(BufferedInputStream stream) throws IOException {
+ dbgLog.fine("decodeRecordType2(): start");
+ if (stream ==null){
+ throw new IllegalArgumentException("stream == null!");
+ }
+
+ /**
+  * Map of <code>Integer</code> value to
+  * SPSS POR data-format code.
+  * Note: after 17, SPSS POR and SAV formats no longer
+  * coincide.
+  */
+ public static Map
+
+ /**
+  * A <code>Set</code> instance that tells whether a given format code
+  * is not a date/time type.
+  */
+ public static final Set
+
+ /**
+  * A <code>String</code> array of short weekday names in English
+  */
+ public static String[] WEEKDAYS = {"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"};
+
+ /**
+  * A <code>String</code> array of short month names in English
+  */
+ public static String[] MONTHS = {"Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"};
+
+ /**
+  * A mapping table from an <code>Integer</code> to
+  * a short-weekday name in English.
+  */
+ public static final Map
+
+ /**
+  * A mapping table from an <code>Integer</code> to
+  * a short-month name in English.
+  */
+ public static final Map