Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SAS dataset label #213

Merged
merged 19 commits into from
Sep 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/bin/readstat.c
Original file line number Diff line number Diff line change
Expand Up @@ -424,15 +424,19 @@ size_t readstat_strftime(char *s, size_t maxsize, const char *format, time_t tim
static int dump_metadata(readstat_metadata_t *metadata, void *ctx) {
printf("Columns: %d\n", readstat_get_var_count(metadata));
printf("Rows: %d\n", readstat_get_row_count(metadata));
const char *table_name = readstat_get_table_name(metadata);
const char *file_label = readstat_get_file_label(metadata);
const char *orig_encoding = readstat_get_file_encoding(metadata);
long version = readstat_get_file_format_version(metadata);
time_t timestamp = readstat_get_creation_time(metadata);
readstat_compress_t compression = readstat_get_compression(metadata);
readstat_endian_t endianness = readstat_get_endianness(metadata);

if (table_name && table_name[0]) {
printf("Table name: %s\n", table_name);
}
if (file_label && file_label[0]) {
printf("File label: %s\n", file_label);
printf("Table label: %s\n", file_label);
}
if (version) {
printf("Format version: %ld\n", version);
Expand Down
2 changes: 1 addition & 1 deletion src/readstat.h
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ typedef struct readstat_writer_s {

int row_count;
int current_row;
char file_label[100];
char file_label[257];
char table_name[33];
const readstat_variable_t *fweight_variable;

Expand Down
18 changes: 9 additions & 9 deletions src/sas/readstat_sas.c
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ readstat_error_t sas_read_header(readstat_io_t *io, sas_header_info_t *hinfo,
retval = READSTAT_ERROR_UNSUPPORTED_CHARSET;
goto cleanup;
}
memcpy(hinfo->file_label, header_start.file_label, sizeof(header_start.file_label));
memcpy(hinfo->table_name, header_start.table_name, sizeof(header_start.table_name));
if (io->seek(hinfo->pad1, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
retval = READSTAT_ERROR_SEEK;
goto cleanup;
Expand Down Expand Up @@ -317,16 +317,16 @@ readstat_error_t sas_write_header(readstat_writer_t *writer, sas_header_info_t *
struct tm epoch_tm = { .tm_year = 60, .tm_mday = 1 };
time_t epoch = mktime(&epoch_tm);

memset(header_start.file_label, ' ', sizeof(header_start.file_label));
memset(header_start.table_name, ' ', sizeof(header_start.table_name));

size_t file_label_len = strlen(writer->file_label);
if (file_label_len > sizeof(header_start.file_label))
file_label_len = sizeof(header_start.file_label);
size_t table_name_len = strlen(writer->table_name);
if (table_name_len > sizeof(header_start.table_name))
table_name_len = sizeof(header_start.table_name);

if (file_label_len) {
memcpy(header_start.file_label, writer->file_label, file_label_len);
if (table_name_len) {
memcpy(header_start.table_name, writer->table_name, table_name_len);
} else {
memcpy(header_start.file_label, "DATASET", sizeof("DATASET")-1);
memcpy(header_start.table_name, "DATASET", sizeof("DATASET")-1);
}

retval = readstat_write_bytes(writer, &header_start, sizeof(sas_header_start_t));
Expand Down Expand Up @@ -379,7 +379,7 @@ readstat_error_t sas_write_header(readstat_writer_t *writer, sas_header_info_t *
goto cleanup;

sas_header_end_t header_end = {
.host = "W32_VSPRO"
.host = "9.0401M6Linux"
};

char release[sizeof(header_end.release)+1] = { 0 };
Expand Down
6 changes: 4 additions & 2 deletions src/sas/readstat_sas.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ typedef struct sas_header_start_s {
unsigned char encoding;
unsigned char mystery5[13];
char file_type[8];
char file_label[64];
char table_name[32];
unsigned char mystery6[32];
char file_info[8];
} sas_header_start_t;

Expand Down Expand Up @@ -47,7 +48,8 @@ typedef struct sas_header_info_s {
int64_t header_size;
time_t creation_time;
time_t modification_time;
char file_label[64];
char table_name[32];
char file_label[256];
char *encoding;
} sas_header_info_t;

Expand Down
8 changes: 4 additions & 4 deletions src/sas/readstat_sas7bcat_read.c
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ readstat_error_t readstat_parse_sas7bcat(readstat_parser_t *parser, const char *
}

if (ctx->metadata_handler) {
char file_label[4*64+1];
char table_name[4*32+1];
readstat_metadata_t metadata = {
.file_encoding = ctx->input_encoding, /* orig encoding? */
.modified_time = hinfo->modification_time,
Expand All @@ -424,12 +424,12 @@ readstat_error_t readstat_parse_sas7bcat(readstat_parser_t *parser, const char *
.endianness = hinfo->little_endian ? READSTAT_ENDIAN_LITTLE : READSTAT_ENDIAN_BIG,
.is64bit = ctx->u64
};
retval = readstat_convert(file_label, sizeof(file_label),
hinfo->file_label, sizeof(hinfo->file_label), ctx->converter);
retval = readstat_convert(table_name, sizeof(table_name),
hinfo->table_name, sizeof(hinfo->table_name), ctx->converter);
if (retval != READSTAT_OK)
goto cleanup;

metadata.file_label = file_label;
metadata.table_name = table_name;

if (ctx->metadata_handler(&metadata, ctx->user_ctx) != READSTAT_HANDLER_OK) {
retval = READSTAT_ERROR_USER_ABORT;
Expand Down
42 changes: 39 additions & 3 deletions src/sas/readstat_sas7bdat_read.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ typedef struct sas7bdat_ctx_s {
time_t ctime;
time_t mtime;
int version;
char file_label[4*64+1];
char table_name[4*32+1];
char file_label[4*256+1];
char error_buf[2048];

unsigned int rdc_compression:1;
Expand Down Expand Up @@ -285,6 +286,7 @@ static readstat_error_t sas7bdat_parse_column_name_subheader(const char *subhead
int i;
const char *cnp = &subheader[signature_len+8];
uint16_t remainder = sas_read2(&subheader[signature_len], ctx->bswap);
int off;

if (remainder != sas_subheader_remainder(len, signature_len)) {
retval = READSTAT_ERROR_PARSE;
Expand All @@ -298,6 +300,39 @@ static readstat_error_t sas7bdat_parse_column_name_subheader(const char *subhead

for (i=ctx->col_names_count-cmax; i<ctx->col_names_count; i++) {
ctx->col_info[i].name_ref = sas7bdat_parse_text_ref(cnp, ctx);
if (i == 0) {
if (ctx->text_blobs == NULL || ctx->text_blob_lengths == NULL) {
retval = READSTAT_ERROR_PARSE;
goto cleanup;
}
if (ctx->version < 9) {
off = 36;
} else {
if (ctx->text_blob_lengths[0] < 19) {
retval = READSTAT_ERROR_PARSE;
goto cleanup;
}
if (!memcmp(&ctx->text_blobs[0][12], "SASYZCR", 7)) {
off = 44;
} else {
off = ctx->u64 ? 36 : 12;
}
}
if (ctx->col_info[0].name_ref.offset >=
ctx->text_blob_lengths[0] ||
ctx->col_info[0].name_ref.offset < off) {
retval = READSTAT_ERROR_PARSE;
goto cleanup;
}
retval = readstat_convert(ctx->file_label,
sizeof(ctx->file_label),
&ctx->text_blobs[0][off],
ctx->col_info[0].name_ref.offset - off,
ctx->converter
);
if (retval != READSTAT_OK)
goto cleanup;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think additional pointer validation is necessary here. We need to ensure:

  • The first memcmp does not read past the end of text_blobs[0] (i.e. beyond text_blob_lengths[0] bytes)
  • off is not larger than ctx->col_info[0].name_ref.offset
  • ctx->file_label is not overrun during the memcpy

Additionally, file_label likely needs to be recoded, so I suggest using readstat_convert here instead of the memcpy. That will automatically solve the third issue. Something like:

readstat_convert(ctx->file_label, sizeof(ctx->file_label),
    &ctx->text_blobs[0][off], ctx->col_info[0].name_ref.offset - off, ctx->converter)

Then check the return value.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be addressed with the last commit

cnp += 8;
}

Expand Down Expand Up @@ -712,6 +747,7 @@ static readstat_error_t sas7bdat_submit_columns(sas7bdat_ctx_t *ctx, int compres
readstat_metadata_t metadata = {
.row_count = ctx->row_limit,
.var_count = ctx->column_count,
.table_name = ctx->table_name,
.file_label = ctx->file_label,
.file_encoding = ctx->input_encoding, /* orig encoding? */
.creation_time = ctx->ctime,
Expand Down Expand Up @@ -1219,8 +1255,8 @@ readstat_error_t readstat_parse_sas7bdat(readstat_parser_t *parser, const char *
ctx->converter = converter;
}

if ((retval = readstat_convert(ctx->file_label, sizeof(ctx->file_label),
hinfo->file_label, sizeof(hinfo->file_label), ctx->converter)) != READSTAT_OK) {
if ((retval = readstat_convert(ctx->table_name, sizeof(ctx->table_name),
hinfo->table_name, sizeof(hinfo->table_name), ctx->converter)) != READSTAT_OK) {
goto cleanup;
}

Expand Down
7 changes: 5 additions & 2 deletions src/sas/readstat_sas7bdat_write.c
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ static readstat_error_t sas7bdat_emit_header(readstat_writer_t *writer, sas_head
.file_format = SAS_FILE_FORMAT_UNIX,
.encoding = 20, /* UTF-8 */
.file_type = "SAS FILE",
.file_info = "DATA ~ ~"
.file_info = "DATA "
};

memcpy(&header_start.magic, sas7bdat_magic_number, sizeof(header_start.magic));
Expand Down Expand Up @@ -218,13 +218,16 @@ static sas7bdat_subheader_t *sas7bdat_col_name_subheader_init(readstat_writer_t
sas7bdat_subheader_t *subheader = sas7bdat_subheader_init(
SAS_SUBHEADER_SIGNATURE_COLUMN_NAME, len);
memcpy(&subheader->data[signature_len], &remainder, sizeof(uint16_t));

sas_text_ref_t text_ref = sas7bdat_make_text_ref(column_text_array, "READSTAT");
text_ref = sas7bdat_make_text_ref(column_text_array, writer->file_label);

int i;
char *ptrs = &subheader->data[signature_len+8];
for (i=0; i<writer->variables_count; i++) {
readstat_variable_t *variable = readstat_get_variable(writer, i);
const char *name = readstat_variable_get_name(variable);
sas_text_ref_t text_ref = sas7bdat_make_text_ref(column_text_array, name);
text_ref = sas7bdat_make_text_ref(column_text_array, name);
memcpy(&ptrs[0], &text_ref.index, sizeof(uint16_t));
memcpy(&ptrs[2], &text_ref.offset, sizeof(uint16_t));
memcpy(&ptrs[4], &text_ref.length, sizeof(uint16_t));
Expand Down
2 changes: 1 addition & 1 deletion src/sas/readstat_xport_read.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ typedef struct xport_ctx_s {
int row_offset;
size_t row_length;
int parsed_row_count;
char file_label[40*4+1];
char file_label[256*4+1];
char table_name[32*4+1];

readstat_variable_t **variables;
Expand Down
4 changes: 3 additions & 1 deletion src/test/test_read.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,16 @@ void parse_ctx_reset(rt_parse_ctx_t *parse_ctx, long file_format) {
} else if ((file_format & RT_FORMAT_SAV)) {
parse_ctx->max_file_label_len = 64;
} else if ((file_format & RT_FORMAT_SAS7BDAT)) {
parse_ctx->max_file_label_len = 64;
parse_ctx->max_table_name_len = 32;
parse_ctx->max_file_label_len = 256;
} else {
parse_ctx->max_file_label_len = 20;
}
if ((file_format & RT_FORMAT_XPORT_5)) {
parse_ctx->max_table_name_len = 8;
} else if ((file_format & RT_FORMAT_XPORT_8)) {
parse_ctx->max_table_name_len = 32;
parse_ctx->max_file_label_len = 256;
}
parse_ctx->var_index = -1;
parse_ctx->obs_index = -1;
Expand Down
4 changes: 3 additions & 1 deletion src/test/test_write.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ readstat_error_t write_file_to_buffer(rt_test_file_t *file, rt_buffer_t *buffer,

readstat_writer_t *writer = readstat_writer_init();
readstat_set_data_writer(writer, &write_data);
readstat_writer_set_file_label(writer, file->label);
if ((format & RT_FORMAT_SAS7BCAT)) {
strncpy(file->label, "", 1);
} else readstat_writer_set_file_label(writer, file->label);
readstat_writer_set_table_name(writer, file->table_name);
readstat_writer_set_error_handler(writer, &handle_error);
if (file->timestamp.tm_year) {
Expand Down