diff --git a/NEWS.md b/NEWS.md index 054502fe..8a744138 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,12 @@ # haven (development version) +* Updated to ReadStat dev a4984d5. + + * Fix issue writing SAV files with labels for string values longer than 8 bytes (#550). + * Fix issue reading SAS files with zero observations (#627). + * Fix issue writing XPT V8 datasets with long variable labels (#784). + * Fix issue reading SAV files where an MR set name contains a non-ASCII character (#788). + * `col_select` in the `read_*()` functions now correctly implements the tidyselect interface. Columns will be returned in the order specified in `col_select` and can be renamed, e.g. `col_select = c(new = old)` (#685). @@ -39,7 +46,7 @@ # haven 2.5.5 -* Updated ReadStat to fix stricter gcc diagnostics. +* Updated to ReadStat dev b2d5407 to fix stricter gcc diagnostics. # haven 2.5.4 diff --git a/src/readstat/readstat_error.c b/src/readstat/readstat_error.c index d5d84ab3..5d0739d5 100644 --- a/src/readstat/readstat_error.c +++ b/src/readstat/readstat_error.c @@ -122,5 +122,8 @@ const char *readstat_error_message(readstat_error_t error_code) { if (error_code == READSTAT_ERROR_BAD_TIMESTAMP_VALUE) return "The provided file timestamp is invalid"; + if (error_code == READSTAT_ERROR_BAD_MR_STRING) + return "A multi-response set record is invalid"; + return "Unknown error"; } diff --git a/src/readstat/sas/readstat_sas7bdat_read.c b/src/readstat/sas/readstat_sas7bdat_read.c index 20c76a8b..b7f02965 100644 --- a/src/readstat/sas/readstat_sas7bdat_read.c +++ b/src/readstat/sas/readstat_sas7bdat_read.c @@ -667,7 +667,7 @@ static readstat_error_t sas7bdat_validate_column(col_info_t *col_info) { } } if (col_info->type == READSTAT_TYPE_STRING) { - if (col_info->width > INT16_MAX || col_info->width == 0) { + if (col_info->width > INT16_MAX) { return READSTAT_ERROR_PARSE; } } diff --git a/src/readstat/sas/readstat_xport_write.c b/src/readstat/sas/readstat_xport_write.c index 1f7b317b..2383360d 
100644 --- a/src/readstat/sas/readstat_xport_write.c +++ b/src/readstat/sas/readstat_xport_write.c @@ -49,12 +49,21 @@ static readstat_error_t xport_write_record(readstat_writer_t *writer, const char return retval; } -static readstat_error_t xport_write_header_record_v8(readstat_writer_t *writer, +static readstat_error_t xport_write_header_record_obsv8(readstat_writer_t *writer, + int row_count) { + char record[RECORD_LEN+1]; + snprintf(record, sizeof(record), + "HEADER RECORD*******OBSV8 HEADER RECORD!!!!!!!" "%15d", + row_count); + return xport_write_record(writer, record); +} + +static readstat_error_t xport_write_header_record_labelv8(readstat_writer_t *writer, xport_header_record_t *xrecord) { char record[RECORD_LEN+1]; snprintf(record, sizeof(record), - "HEADER RECORD*******%-8sHEADER RECORD!!!!!!!" "%15d" "%15d", - xrecord->name, xrecord->num1, xrecord->num2); + "HEADER RECORD*******%-8sHEADER RECORD!!!!!!!" "%-5d", + xrecord->name, xrecord->num1); return xport_write_record(writer, record); } @@ -167,7 +176,7 @@ static readstat_error_t xport_write_variables(readstat_writer_t *writer) { if (any_has_long_format) { strcpy(header.name, "LABELV9"); } - retval = xport_write_header_record_v8(writer, &header); + retval = xport_write_header_record_labelv8(writer, &header); if (retval != READSTAT_OK) goto cleanup; @@ -357,11 +366,7 @@ static readstat_error_t xport_write_namestr_header_record(readstat_writer_t *wri static readstat_error_t xport_write_obs_header_record(readstat_writer_t *writer) { if (writer->version == 8) { - xport_header_record_t xrecord = { - .name = "OBSV8", - .num1 = writer->row_count - }; - return xport_write_header_record_v8(writer, &xrecord); + return xport_write_header_record_obsv8(writer, writer->row_count); } xport_header_record_t xrecord = { .name = "OBS" diff --git a/src/readstat/spss/readstat_sav_parse_mr_name.c b/src/readstat/spss/readstat_sav_parse_mr_name.c index 4bfad0c1..5125065e 100644 --- 
a/src/readstat/spss/readstat_sav_parse_mr_name.c +++ b/src/readstat/spss/readstat_sav_parse_mr_name.c @@ -1,66 +1,68 @@ + #line 1 "src/spss/readstat_sav_parse_mr_name.rl" #include #include #include #include "../readstat.h" #include "../readstat_malloc.h" +#include "../readstat_iconv.h" +#include "../readstat_convert.h" +#include "readstat_sav.h" -#line 8 "src/spss/readstat_sav_parse_mr_name.c" -static const signed char _mr_extractor_actions[] = { - 0, 1, 0, 1, 1, 1, 2, 1, - 3, 1, 4, 0 +#line 11 "src/spss/readstat_sav_parse_mr_name.c" +static const char _mr_extractor_actions[] = { + 0, 1, 0, 1, 1, 1, 2, 1, + 3, 1, 4 }; -static const signed char _mr_extractor_key_offsets[] = { - 0, 0, 8, 17, 19, 22, 24, 27, - 36, 48, 0 +static const char _mr_extractor_key_offsets[] = { + 0, 0, 2, 4, 6, 9, 11, 14, + 19, 24, 29, 31, 36, 41 }; static const char _mr_extractor_trans_keys[] = { - 46, 95, 48, 57, 65, 90, 97, 122, - 46, 61, 95, 48, 57, 65, 90, 97, - 122, 67, 68, 32, 48, 57, 48, 57, - 32, 48, 57, 32, 46, 95, 48, 57, - 65, 90, 97, 122, 0, 32, 46, 95, - 9, 13, 48, 57, 65, 90, 97, 122, - 46, 95, 48, 57, 65, 90, 97, 122, - 0 + 32, 61, 32, 61, 67, 68, 32, 48, + 57, 48, 57, 32, 48, 57, 0, 32, + 61, 9, 13, 0, 32, 61, 9, 13, + 0, 32, 61, 9, 13, 32, 61, 0, + 32, 61, 9, 13, 0, 32, 61, 9, + 13, 0, 32, 9, 13, 0 +}; + +static const char _mr_extractor_single_lengths[] = { + 0, 2, 2, 0, 1, 0, 1, 3, + 3, 3, 2, 3, 3, 2 }; -static const signed char _mr_extractor_single_lengths[] = { - 0, 2, 3, 0, 1, 0, 1, 3, - 4, 2, 0 +static const char _mr_extractor_range_lengths[] = { + 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 0, 1, 1, 1 }; -static const signed char _mr_extractor_range_lengths[] = { - 0, 3, 3, 1, 1, 1, 1, 3, - 4, 3, 0 +static const char _mr_extractor_index_offsets[] = { + 0, 0, 3, 6, 8, 11, 13, 16, + 21, 26, 31, 34, 39, 44 }; -static const signed char _mr_extractor_index_offsets[] = { - 0, 0, 6, 13, 15, 18, 20, 23, - 30, 39, 0 +static const char _mr_extractor_indicies[] = { + 1, 1, 0, 1, 2, 0, 3, 1, 
+ 4, 5, 1, 6, 1, 7, 6, 1, + 9, 10, 1, 9, 8, 11, 12, 1, + 11, 8, 9, 10, 1, 9, 8, 1, + 1, 8, 13, 14, 1, 13, 8, 11, + 15, 1, 11, 8, 15, 15, 15, 1, + 0 }; -static const signed char _mr_extractor_cond_targs[] = { - 2, 2, 2, 2, 2, 0, 2, 3, - 2, 2, 2, 2, 0, 4, 0, 5, - 4, 0, 6, 0, 7, 6, 0, 7, - 8, 8, 8, 8, 8, 0, 9, 9, - 8, 8, 9, 8, 8, 8, 0, 8, - 8, 8, 8, 8, 0, 0, 1, 2, - 3, 4, 5, 6, 7, 8, 9, 0 +static const char _mr_extractor_trans_targs[] = { + 2, 0, 3, 4, 5, 4, 6, 8, + 7, 9, 10, 11, 8, 11, 12, 13 }; -static const signed char _mr_extractor_cond_actions[] = { - 0, 0, 0, 0, 0, 0, 0, 1, - 0, 0, 0, 0, 0, 3, 0, 5, - 0, 0, 0, 0, 7, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 9, 9, - 0, 0, 9, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0 +static const char _mr_extractor_trans_actions[] = { + 0, 0, 1, 3, 5, 0, 0, 7, + 0, 9, 9, 0, 0, 9, 9, 0 }; static const int mr_extractor_start = 1; @@ -68,321 +70,306 @@ static const int mr_extractor_start = 1; static const int mr_extractor_en_main = 1; -#line 107 "src/spss/readstat_sav_parse_mr_name.rl" - - -readstat_error_t extract_mr_data(const char *line, mr_set_t *result) { - readstat_error_t retval = READSTAT_OK; - - // Variables needed for Ragel operation - int cs = 0; - char *p = (char *)line; - char *start = p; - char *pe = p + strlen(p) + 1; - - // Variables needed for passing Ragel intermediate results - char mr_type = '\0'; - int mr_counted_value = -1; - int mr_subvar_count = 0; - char **mr_subvariables = NULL; - char *mr_name = NULL; - char *mr_label = NULL; - - // Execute Ragel finite state machine (FSM) +#line 121 "src/spss/readstat_sav_parse_mr_name.rl" + +readstat_error_t extract_mr_data(const char *line, mr_set_t *result, sav_ctx_t *ctx) { + readstat_error_t retval = READSTAT_OK; + + // Variables needed for Ragel operation + int cs = 0; + char *p = (char *)line; + char *start = p; + char *pe = p + strlen(p) + 1; + + // Variables needed for passing Ragel intermediate results + char mr_type = '\0'; + int mr_counted_value = 
-1; + int mr_subvar_count = 0; + char **mr_subvariables = NULL; + char *mr_name = NULL; + char *mr_label = NULL; + + // Execute Ragel finite state machine (FSM) + #line 89 "src/spss/readstat_sav_parse_mr_name.c" { - cs = (int)mr_extractor_start; + cs = mr_extractor_start; } - -#line 127 "src/spss/readstat_sav_parse_mr_name.rl" - -#line 94 "src/spss/readstat_sav_parse_mr_name.c" +#line 142 "src/spss/readstat_sav_parse_mr_name.rl" + +#line 92 "src/spss/readstat_sav_parse_mr_name.c" { - int _klen; - unsigned int _trans = 0; - const char * _keys; - const signed char * _acts; - unsigned int _nacts; - _resume: {} - if ( p == pe ) - goto _out; - _keys = ( _mr_extractor_trans_keys + (_mr_extractor_key_offsets[cs])); - _trans = (unsigned int)_mr_extractor_index_offsets[cs]; - - _klen = (int)_mr_extractor_single_lengths[cs]; - if ( _klen > 0 ) { - const char *_lower = _keys; - const char *_upper = _keys + _klen - 1; - const char *_mid; - while ( 1 ) { - if ( _upper < _lower ) { - _keys += _klen; - _trans += (unsigned int)_klen; - break; - } - - _mid = _lower + ((_upper-_lower) >> 1); - if ( ( (*( p))) < (*( _mid)) ) - _upper = _mid - 1; - else if ( ( (*( p))) > (*( _mid)) ) - _lower = _mid + 1; - else { - _trans += (unsigned int)(_mid - _keys); - goto _match; - } - } - } - - _klen = (int)_mr_extractor_range_lengths[cs]; - if ( _klen > 0 ) { - const char *_lower = _keys; - const char *_upper = _keys + (_klen<<1) - 2; - const char *_mid; - while ( 1 ) { - if ( _upper < _lower ) { - _trans += (unsigned int)_klen; - break; - } - - _mid = _lower + (((_upper-_lower) >> 1) & ~1); - if ( ( (*( p))) < (*( _mid)) ) - _upper = _mid - 2; - else if ( ( (*( p))) > (*( _mid + 1)) ) - _lower = _mid + 2; - else { - _trans += (unsigned int)((_mid - _keys)>>1); - break; - } + int _klen; + unsigned int _trans; + const char *_acts; + unsigned int _nacts; + const char *_keys; + + if ( p == pe ) + goto _test_eof; + if ( cs == 0 ) + goto _out; +_resume: + _keys = _mr_extractor_trans_keys + 
_mr_extractor_key_offsets[cs]; + _trans = _mr_extractor_index_offsets[cs]; + + _klen = _mr_extractor_single_lengths[cs]; + if ( _klen > 0 ) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + _klen - 1; + while (1) { + if ( _upper < _lower ) + break; + + _mid = _lower + ((_upper-_lower) >> 1); + if ( (*p) < *_mid ) + _upper = _mid - 1; + else if ( (*p) > *_mid ) + _lower = _mid + 1; + else { + _trans += (unsigned int)(_mid - _keys); + goto _match; } } - - _match: {} - cs = (int)_mr_extractor_cond_targs[_trans]; - - if ( _mr_extractor_cond_actions[_trans] != 0 ) { - - _acts = ( _mr_extractor_actions + (_mr_extractor_cond_actions[_trans])); - _nacts = (unsigned int)(*( _acts)); - _acts += 1; - while ( _nacts > 0 ) { - switch ( (*( _acts)) ) - { - case 0: { - { -#line 10 "src/spss/readstat_sav_parse_mr_name.rl" - - mr_name = (char *)readstat_malloc(p - start + 1); - if (mr_name == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(mr_name, start, p - start); - mr_name[p - start] = '\0'; - } - -#line 177 "src/spss/readstat_sav_parse_mr_name.c" - - break; - } - case 1: { - { -#line 20 "src/spss/readstat_sav_parse_mr_name.rl" - - mr_type = *p; - start = p + 1; - } - -#line 188 "src/spss/readstat_sav_parse_mr_name.c" - - break; - } - case 2: { - { -#line 25 "src/spss/readstat_sav_parse_mr_name.rl" - - int n_cv_digs = p - start; - char *n_dig_str = (char *)readstat_malloc(n_cv_digs + 1); - if (n_dig_str == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(n_dig_str, start, n_cv_digs); - n_dig_str[n_cv_digs] = '\0'; - int n_digs = strtol(n_dig_str, NULL, 10); - free(n_dig_str); - if (n_digs != 0) { - char *cv = (char *)readstat_malloc(n_digs + 1); - if (cv == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(cv, p + 1, n_digs); - cv[n_digs] = '\0'; - mr_counted_value = strtol(cv, NULL, 10); - free(cv); - p = p + 1 + n_digs; - start = p + 1; - } - else { - mr_counted_value = -1; - } 
- } - -#line 223 "src/spss/readstat_sav_parse_mr_name.c" - - break; - } - case 3: { - { -#line 54 "src/spss/readstat_sav_parse_mr_name.rl" - - char *lbl_len_str = (char *)readstat_malloc(p - start + 1); - if (lbl_len_str == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(lbl_len_str, start, p - start); - lbl_len_str[p - start] = '\0'; - int len = strtol(lbl_len_str, NULL, 10); - free(lbl_len_str); - mr_label = (char *)readstat_malloc(len + 1); - if (mr_label == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(mr_label, p + 1, len); - mr_label[len] = '\0'; - p = p + 1 + len; - start = p + 1; - } - -#line 250 "src/spss/readstat_sav_parse_mr_name.c" - - break; - } - case 4: { - { -#line 75 "src/spss/readstat_sav_parse_mr_name.rl" - - int len = p - start; - char *subvar = (char *)readstat_malloc(len + 1); - if (subvar == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(subvar, start, len); - subvar[len] = '\0'; - start = p + 1; - char **new_subvariables = readstat_realloc(mr_subvariables, sizeof(char *) * (mr_subvar_count + 1)); - if (new_subvariables == NULL) { - free(subvar); - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - mr_subvariables = new_subvariables; - mr_subvariables[mr_subvar_count++] = subvar; - } - -#line 276 "src/spss/readstat_sav_parse_mr_name.c" - - break; - } - } - _nacts -= 1; - _acts += 1; + _keys += _klen; + _trans += _klen; + } + + _klen = _mr_extractor_range_lengths[cs]; + if ( _klen > 0 ) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + (_klen<<1) - 2; + while (1) { + if ( _upper < _lower ) + break; + + _mid = _lower + (((_upper-_lower) >> 1) & ~1); + if ( (*p) < _mid[0] ) + _upper = _mid - 2; + else if ( (*p) > _mid[1] ) + _lower = _mid + 2; + else { + _trans += (unsigned int)((_mid - _keys)>>1); + goto _match; } - - } - - if ( cs != 0 ) { - p += 1; - goto _resume; } - _out: {} + _trans += _klen; } - -#line 128 
"src/spss/readstat_sav_parse_mr_name.rl" - - - // Check if FSM finished successfully - if (cs < -#line 296 "src/spss/readstat_sav_parse_mr_name.c" -9 -#line 131 "src/spss/readstat_sav_parse_mr_name.rl" - || p != pe) { - retval = READSTAT_ERROR_BAD_MR_STRING; - goto cleanup; + +_match: + _trans = _mr_extractor_indicies[_trans]; + cs = _mr_extractor_trans_targs[_trans]; + + if ( _mr_extractor_trans_actions[_trans] == 0 ) + goto _again; + + _acts = _mr_extractor_actions + _mr_extractor_trans_actions[_trans]; + _nacts = (unsigned int) *_acts++; + while ( _nacts-- > 0 ) + { + switch ( *_acts++ ) + { + case 0: +#line 13 "src/spss/readstat_sav_parse_mr_name.rl" + { + size_t src_len = p - start; + size_t dst_len = 4 * src_len + 1; // UTF-8 expansion: up to 4 bytes per char + mr_name = (char *)readstat_malloc(dst_len); + if (mr_name == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + retval = readstat_convert(mr_name, dst_len, start, src_len, ctx->converter); + if (retval != READSTAT_OK) { + goto cleanup; + } + } + break; + case 1: +#line 27 "src/spss/readstat_sav_parse_mr_name.rl" + { + mr_type = *p; + start = p + 1; + } + break; + case 2: +#line 32 "src/spss/readstat_sav_parse_mr_name.rl" + { + int n_cv_digs = p - start; + char *n_dig_str = (char *)readstat_malloc(n_cv_digs + 1); + if (n_dig_str == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + memcpy(n_dig_str, start, n_cv_digs); + n_dig_str[n_cv_digs] = '\0'; + int n_digs = strtol(n_dig_str, NULL, 10); + free(n_dig_str); + if (n_digs != 0) { + char *cv = (char *)readstat_malloc(n_digs + 1); + if (cv == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + memcpy(cv, p + 1, n_digs); + cv[n_digs] = '\0'; + mr_counted_value = strtol(cv, NULL, 10); + free(cv); + p = p + 1 + n_digs; + start = p + 1; + } + else { + mr_counted_value = -1; + } + } + break; + case 3: +#line 61 "src/spss/readstat_sav_parse_mr_name.rl" + { + char *lbl_len_str = (char *)readstat_malloc(p - start + 1); + if 
(lbl_len_str == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + memcpy(lbl_len_str, start, p - start); + lbl_len_str[p - start] = '\0'; + int len = strtol(lbl_len_str, NULL, 10); + free(lbl_len_str); + size_t dst_len = 4 * len + 1; // UTF-8 expansion: up to 4 bytes per char + mr_label = (char *)readstat_malloc(dst_len); + if (mr_label == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + retval = readstat_convert(mr_label, dst_len, p + 1, len, ctx->converter); + if (retval != READSTAT_OK) { + goto cleanup; + } + p = p + 1 + len; + start = p + 1; + } + break; + case 4: +#line 85 "src/spss/readstat_sav_parse_mr_name.rl" + { + size_t src_len = p - start; + size_t dst_len = 4 * src_len + 1; // UTF-8 expansion: up to 4 bytes per char + char *subvar = (char *)readstat_malloc(dst_len); + if (subvar == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + retval = readstat_convert(subvar, dst_len, start, src_len, ctx->converter); + if (retval != READSTAT_OK) { + free(subvar); + goto cleanup; + } + start = p + 1; + char **new_subvariables = readstat_realloc(mr_subvariables, sizeof(char *) * (mr_subvar_count + 1)); + if (new_subvariables == NULL) { + free(subvar); + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + mr_subvariables = new_subvariables; + mr_subvariables[mr_subvar_count++] = subvar; + } + break; +#line 266 "src/spss/readstat_sav_parse_mr_name.c" + } } - - (void)mr_extractor_en_main; - - // Assign parsed values to output parameter - result->name = mr_name; - result->label = mr_label; - result->type = mr_type; - result->counted_value = mr_counted_value; - result->subvariables = mr_subvariables; - result->num_subvars = mr_subvar_count; - if (result->type == 'D') { - result->is_dichotomy = 1; + +_again: + if ( cs == 0 ) + goto _out; + if ( ++p != pe ) + goto _resume; + _test_eof: {} + _out: {} } - - cleanup: - if (retval != READSTAT_OK) { - if (mr_subvariables != NULL) { - for (int i = 0; i < mr_subvar_count; i++) { - if 
(mr_subvariables[i] != NULL) free(mr_subvariables[i]); - } - free(mr_subvariables); - } - if (mr_name != NULL) free(mr_name); - if (mr_label != NULL) free(mr_label); - } - return retval; + +#line 143 "src/spss/readstat_sav_parse_mr_name.rl" + + // Check if FSM finished successfully + if (cs < 8 || p != pe) { + retval = READSTAT_ERROR_BAD_MR_STRING; + goto cleanup; + } + + (void)mr_extractor_en_main; + + // Assign parsed values to output parameter + result->name = mr_name; + result->label = mr_label; + result->type = mr_type; + result->counted_value = mr_counted_value; + result->subvariables = mr_subvariables; + result->num_subvars = mr_subvar_count; + if (result->type == 'D') { + result->is_dichotomy = 1; + } + +cleanup: + if (retval != READSTAT_OK) { + if (mr_subvariables != NULL) { + for (int i = 0; i < mr_subvar_count; i++) { + if (mr_subvariables[i] != NULL) free(mr_subvariables[i]); + } + free(mr_subvariables); + } + if (mr_name != NULL) free(mr_name); + if (mr_label != NULL) free(mr_label); + } + return retval; } -readstat_error_t parse_mr_line(const char *line, mr_set_t *result) { - *result = (mr_set_t){0}; - return extract_mr_data(line, result); +readstat_error_t parse_mr_line(const char *line, mr_set_t *result, sav_ctx_t *ctx) { + *result = (mr_set_t){0}; + return extract_mr_data(line, result, ctx); } -#line 335 "src/spss/readstat_sav_parse_mr_name.c" -static const signed char _mr_parser_actions[] = { - 0, 1, 0, 0 +#line 317 "src/spss/readstat_sav_parse_mr_name.c" +static const char _mr_parser_actions[] = { + 0, 1, 0 }; -static const signed char _mr_parser_key_offsets[] = { - 0, 0, 1, 2, 4, 0 +static const char _mr_parser_key_offsets[] = { + 0, 0, 1, 2, 4 }; static const char _mr_parser_trans_keys[] = { 36, 10, 0, 10, 10, 0 }; -static const signed char _mr_parser_single_lengths[] = { - 0, 1, 1, 2, 1, 0 +static const char _mr_parser_single_lengths[] = { + 0, 1, 1, 2, 1 +}; + +static const char _mr_parser_range_lengths[] = { + 0, 0, 0, 0, 0 }; -static const 
signed char _mr_parser_range_lengths[] = { - 0, 0, 0, 0, 0, 0 +static const char _mr_parser_index_offsets[] = { + 0, 0, 2, 4, 7 }; -static const signed char _mr_parser_index_offsets[] = { - 0, 0, 2, 4, 7, 0 +static const char _mr_parser_indicies[] = { + 0, 1, 2, 0, 3, 2, 0, 2, + 0, 0 }; -static const signed char _mr_parser_cond_targs[] = { - 2, 0, 3, 2, 4, 3, 2, 3, - 2, 0, 1, 2, 3, 4, 0 +static const char _mr_parser_trans_targs[] = { + 2, 0, 3, 4 }; -static const signed char _mr_parser_cond_actions[] = { - 0, 0, 1, 0, 0, 1, 0, 1, - 0, 0, 0, 0, 0, 0, 0 +static const char _mr_parser_trans_actions[] = { + 0, 0, 1, 0 }; static const int mr_parser_start = 1; @@ -390,157 +377,148 @@ static const int mr_parser_start = 1; static const int mr_parser_en_main = 1; -#line 202 "src/spss/readstat_sav_parse_mr_name.rl" +#line 216 "src/spss/readstat_sav_parse_mr_name.rl" -readstat_error_t parse_mr_string(const char *line, mr_set_t **mr_sets, size_t *n_mr_lines) { - readstat_error_t retval = READSTAT_OK; - int cs = 0; - char *p = (char *)line; - char *start = p; - char *pe = p + strlen(p) + 1; - *mr_sets = NULL; - *n_mr_lines = 0; - +readstat_error_t parse_mr_string(const char *line, mr_set_t **mr_sets, size_t *n_mr_lines, sav_ctx_t *ctx) { + readstat_error_t retval = READSTAT_OK; + int cs = 0; + char *p = (char *)line; + char *start = p; + char *pe = p + strlen(p) + 1; + *mr_sets = NULL; + *n_mr_lines = 0; -#line 385 "src/spss/readstat_sav_parse_mr_name.c" + +#line 369 "src/spss/readstat_sav_parse_mr_name.c" { - cs = (int)mr_parser_start; + cs = mr_parser_start; } - -#line 213 "src/spss/readstat_sav_parse_mr_name.rl" - -#line 390 "src/spss/readstat_sav_parse_mr_name.c" +#line 228 "src/spss/readstat_sav_parse_mr_name.rl" + +#line 372 "src/spss/readstat_sav_parse_mr_name.c" { - int _klen; - unsigned int _trans = 0; - const char * _keys; - const signed char * _acts; - unsigned int _nacts; - _resume: {} - if ( p == pe ) - goto _out; - _keys = ( _mr_parser_trans_keys + 
(_mr_parser_key_offsets[cs])); - _trans = (unsigned int)_mr_parser_index_offsets[cs]; - - _klen = (int)_mr_parser_single_lengths[cs]; - if ( _klen > 0 ) { - const char *_lower = _keys; - const char *_upper = _keys + _klen - 1; - const char *_mid; - while ( 1 ) { - if ( _upper < _lower ) { - _keys += _klen; - _trans += (unsigned int)_klen; - break; - } - - _mid = _lower + ((_upper-_lower) >> 1); - if ( ( (*( p))) < (*( _mid)) ) - _upper = _mid - 1; - else if ( ( (*( p))) > (*( _mid)) ) - _lower = _mid + 1; - else { - _trans += (unsigned int)(_mid - _keys); - goto _match; - } + int _klen; + unsigned int _trans; + const char *_acts; + unsigned int _nacts; + const char *_keys; + + if ( p == pe ) + goto _test_eof; + if ( cs == 0 ) + goto _out; +_resume: + _keys = _mr_parser_trans_keys + _mr_parser_key_offsets[cs]; + _trans = _mr_parser_index_offsets[cs]; + + _klen = _mr_parser_single_lengths[cs]; + if ( _klen > 0 ) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + _klen - 1; + while (1) { + if ( _upper < _lower ) + break; + + _mid = _lower + ((_upper-_lower) >> 1); + if ( (*p) < *_mid ) + _upper = _mid - 1; + else if ( (*p) > *_mid ) + _lower = _mid + 1; + else { + _trans += (unsigned int)(_mid - _keys); + goto _match; } } - - _klen = (int)_mr_parser_range_lengths[cs]; - if ( _klen > 0 ) { - const char *_lower = _keys; - const char *_upper = _keys + (_klen<<1) - 2; - const char *_mid; - while ( 1 ) { - if ( _upper < _lower ) { - _trans += (unsigned int)_klen; - break; - } - - _mid = _lower + (((_upper-_lower) >> 1) & ~1); - if ( ( (*( p))) < (*( _mid)) ) - _upper = _mid - 2; - else if ( ( (*( p))) > (*( _mid + 1)) ) - _lower = _mid + 2; - else { - _trans += (unsigned int)((_mid - _keys)>>1); - break; - } - } - } - - _match: {} - cs = (int)_mr_parser_cond_targs[_trans]; - - if ( _mr_parser_cond_actions[_trans] != 0 ) { - - _acts = ( _mr_parser_actions + (_mr_parser_cond_actions[_trans])); - _nacts = (unsigned int)(*( _acts)); - _acts += 
1; - while ( _nacts > 0 ) { - switch ( (*( _acts)) ) - { - case 0: { - { -#line 172 "src/spss/readstat_sav_parse_mr_name.rl" - - char *mln = (char *)readstat_malloc(p - start); - if (mln == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(mln, start + 1, p - start); - mln[p - start - 1] = '\0'; - mr_set_t *new_mr_sets = readstat_realloc(*mr_sets, ((*n_mr_lines) + 1) * sizeof(mr_set_t)); - if (new_mr_sets == NULL) { - free(mln); - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - *mr_sets = new_mr_sets; - retval = parse_mr_line(mln, &(*mr_sets)[*n_mr_lines]); - free(mln); - if (retval != READSTAT_OK) { - goto cleanup; - } - (*n_mr_lines)++; - start = p + 1; - } - -#line 487 "src/spss/readstat_sav_parse_mr_name.c" - - break; - } - } - _nacts -= 1; - _acts += 1; + _keys += _klen; + _trans += _klen; + } + + _klen = _mr_parser_range_lengths[cs]; + if ( _klen > 0 ) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + (_klen<<1) - 2; + while (1) { + if ( _upper < _lower ) + break; + + _mid = _lower + (((_upper-_lower) >> 1) & ~1); + if ( (*p) < _mid[0] ) + _upper = _mid - 2; + else if ( (*p) > _mid[1] ) + _lower = _mid + 2; + else { + _trans += (unsigned int)((_mid - _keys)>>1); + goto _match; } - } - - if ( cs != 0 ) { - p += 1; - goto _resume; + _trans += _klen; + } + +_match: + _trans = _mr_parser_indicies[_trans]; + cs = _mr_parser_trans_targs[_trans]; + + if ( _mr_parser_trans_actions[_trans] == 0 ) + goto _again; + + _acts = _mr_parser_actions + _mr_parser_trans_actions[_trans]; + _nacts = (unsigned int) *_acts++; + while ( _nacts-- > 0 ) + { + switch ( *_acts++ ) + { + case 0: +#line 186 "src/spss/readstat_sav_parse_mr_name.rl" + { + char *mln = (char *)readstat_malloc(p - start); + if (mln == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + memcpy(mln, start + 1, p - start); + mln[p - start - 1] = '\0'; + mr_set_t *new_mr_sets = readstat_realloc(*mr_sets, ((*n_mr_lines) + 1) * 
sizeof(mr_set_t)); + if (new_mr_sets == NULL) { + free(mln); + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + *mr_sets = new_mr_sets; + retval = parse_mr_line(mln, &(*mr_sets)[*n_mr_lines], ctx); + free(mln); + if (retval != READSTAT_OK) { + goto cleanup; + } + (*n_mr_lines)++; + start = p + 1; + } + break; +#line 470 "src/spss/readstat_sav_parse_mr_name.c" } - _out: {} } - -#line 214 "src/spss/readstat_sav_parse_mr_name.rl" - - if (cs < -#line 506 "src/spss/readstat_sav_parse_mr_name.c" -4 -#line 216 "src/spss/readstat_sav_parse_mr_name.rl" - || p != pe) { - retval = READSTAT_ERROR_BAD_MR_STRING; - goto cleanup; +_again: + if ( cs == 0 ) + goto _out; + if ( ++p != pe ) + goto _resume; + _test_eof: {} + _out: {} } - - (void)mr_parser_en_main; - - cleanup: - return retval; + +#line 229 "src/spss/readstat_sav_parse_mr_name.rl" + + if (cs < 4 || p != pe) { + retval = READSTAT_ERROR_BAD_MR_STRING; + goto cleanup; + } + + (void)mr_parser_en_main; + +cleanup: + return retval; } diff --git a/src/readstat/spss/readstat_sav_parse_mr_name.h b/src/readstat/spss/readstat_sav_parse_mr_name.h index 39752161..6dce3e00 100644 --- a/src/readstat/spss/readstat_sav_parse_mr_name.h +++ b/src/readstat/spss/readstat_sav_parse_mr_name.h @@ -4,6 +4,6 @@ #include "../readstat.h" #include "../readstat_malloc.h" -readstat_error_t parse_mr_string(const char *line, mr_set_t **mr_sets, size_t *n_mr_lines); +readstat_error_t parse_mr_string(const char *line, mr_set_t **mr_sets, size_t *n_mr_lines, sav_ctx_t *ctx); #endif // READSTAT_PARSE_MR_NAME_H diff --git a/src/readstat/spss/readstat_sav_parse_mr_name.rl b/src/readstat/spss/readstat_sav_parse_mr_name.rl index 817638bf..49fc93da 100644 --- a/src/readstat/spss/readstat_sav_parse_mr_name.rl +++ b/src/readstat/spss/readstat_sav_parse_mr_name.rl @@ -3,18 +3,25 @@ #include #include "../readstat.h" #include "../readstat_malloc.h" +#include "../readstat_iconv.h" +#include "../readstat_convert.h" +#include "readstat_sav.h" %%{ machine 
mr_extractor; action extract_mr_name { - mr_name = (char *)readstat_malloc(p - start + 1); + size_t src_len = p - start; + size_t dst_len = 4 * src_len + 1; // UTF-8 expansion: up to 4 bytes per char + mr_name = (char *)readstat_malloc(dst_len); if (mr_name == NULL) { retval = READSTAT_ERROR_MALLOC; goto cleanup; } - memcpy(mr_name, start, p - start); - mr_name[p - start] = '\0'; + retval = readstat_convert(mr_name, dst_len, start, src_len, ctx->converter); + if (retval != READSTAT_OK) { + goto cleanup; + } } action extract_mr_type { @@ -61,27 +68,34 @@ lbl_len_str[p - start] = '\0'; int len = strtol(lbl_len_str, NULL, 10); free(lbl_len_str); - mr_label = (char *)readstat_malloc(len + 1); + size_t dst_len = 4 * len + 1; // UTF-8 expansion: up to 4 bytes per char + mr_label = (char *)readstat_malloc(dst_len); if (mr_label == NULL) { retval = READSTAT_ERROR_MALLOC; goto cleanup; } - memcpy(mr_label, p + 1, len); - mr_label[len] = '\0'; + retval = readstat_convert(mr_label, dst_len, p + 1, len, ctx->converter); + if (retval != READSTAT_OK) { + goto cleanup; + } p = p + 1 + len; start = p + 1; } action extract_subvar { - int len = p - start; - char *subvar = (char *)readstat_malloc(len + 1); + size_t src_len = p - start; + size_t dst_len = 4 * src_len + 1; // UTF-8 expansion: up to 4 bytes per char + char *subvar = (char *)readstat_malloc(dst_len); if (subvar == NULL) { retval = READSTAT_ERROR_MALLOC; goto cleanup; } - memcpy(subvar, start, len); - subvar[len] = '\0'; - start = p + 1; + retval = readstat_convert(subvar, dst_len, start, src_len, ctx->converter); + if (retval != READSTAT_OK) { + free(subvar); + goto cleanup; + } + start = p + 1; char **new_subvariables = readstat_realloc(mr_subvariables, sizeof(char *) * (mr_subvar_count + 1)); if (new_subvariables == NULL) { free(subvar); @@ -92,7 +106,7 @@ mr_subvariables[mr_subvar_count++] = subvar; } - nc = (alnum | '_' | '.' 
); # name character (including dots) + nc = ([^ =]); # name character (all characters except space and equals) name = nc+ '=' > extract_mr_name; type = ('C' | 'D'){1} > extract_mr_type; counted_value = digit* ' ' > extract_counted_value; @@ -101,12 +115,12 @@ end = (space | '\0'); # subvar token terminator subvariable = (nc+ end >extract_subvar); - main := name type counted_value label subvariable+; + main := name type counted_value label (subvariable+ | end*); write data nofinal noerror; }%% -readstat_error_t extract_mr_data(const char *line, mr_set_t *result) { +readstat_error_t extract_mr_data(const char *line, mr_set_t *result, sav_ctx_t *ctx) { readstat_error_t retval = READSTAT_OK; // Variables needed for Ragel operation @@ -161,9 +175,9 @@ cleanup: } -readstat_error_t parse_mr_line(const char *line, mr_set_t *result) { +readstat_error_t parse_mr_line(const char *line, mr_set_t *result, sav_ctx_t *ctx) { *result = (mr_set_t){0}; - return extract_mr_data(line, result); + return extract_mr_data(line, result, ctx); } %%{ @@ -184,7 +198,7 @@ readstat_error_t parse_mr_line(const char *line, mr_set_t *result) { goto cleanup; } *mr_sets = new_mr_sets; - retval = parse_mr_line(mln, &(*mr_sets)[*n_mr_lines]); + retval = parse_mr_line(mln, &(*mr_sets)[*n_mr_lines], ctx); free(mln); if (retval != READSTAT_OK) { goto cleanup; @@ -201,7 +215,7 @@ readstat_error_t parse_mr_line(const char *line, mr_set_t *result) { write data nofinal noerror; }%% -readstat_error_t parse_mr_string(const char *line, mr_set_t **mr_sets, size_t *n_mr_lines) { +readstat_error_t parse_mr_string(const char *line, mr_set_t **mr_sets, size_t *n_mr_lines, sav_ctx_t *ctx) { readstat_error_t retval = READSTAT_OK; int cs = 0; char *p = (char *)line; diff --git a/src/readstat/spss/readstat_sav_read.c b/src/readstat/spss/readstat_sav_read.c index 731d8add..5810f1ff 100644 --- a/src/readstat/spss/readstat_sav_read.c +++ b/src/readstat/spss/readstat_sav_read.c @@ -167,7 +167,7 @@ static readstat_error_t 
sav_read_multiple_response_sets(size_t data_len, sav_ctx goto cleanup; } - retval = parse_mr_string(mr_string, &ctx->mr_sets, &ctx->multiple_response_sets_length); + retval = parse_mr_string(mr_string, &ctx->mr_sets, &ctx->multiple_response_sets_length, ctx); cleanup: free(mr_string); diff --git a/src/readstat/spss/readstat_sav_write.c b/src/readstat/spss/readstat_sav_write.c index 4355a7aa..0e16a924 100644 --- a/src/readstat/spss/readstat_sav_write.c +++ b/src/readstat/spss/readstat_sav_write.c @@ -913,6 +913,7 @@ static readstat_error_t sav_emit_long_string_value_labels_record(readstat_writer for (k=0; kname); + int32_t user_width = r_variable->user_width; int32_t storage_width = readstat_variable_get_storage_width(r_variable); if (storage_width <= 8) continue; @@ -928,8 +929,13 @@ static readstat_error_t sav_emit_long_string_value_labels_record(readstat_writer if (label_len > MAX_VALUE_LABEL_SIZE) label_len = MAX_VALUE_LABEL_SIZE; + if (r_value_label->string_key_len > user_width) { + retval = READSTAT_ERROR_STRING_VALUE_IS_TOO_LONG; + goto cleanup; + } + info_header.count += sizeof(int32_t); // value length - info_header.count += storage_width; + info_header.count += user_width; info_header.count += sizeof(int32_t); // label length info_header.count += label_len; } @@ -954,12 +960,13 @@ static readstat_error_t sav_emit_long_string_value_labels_record(readstat_writer for (k=0; kname); + int32_t user_width = r_variable->user_width; int32_t storage_width = readstat_variable_get_storage_width(r_variable); if (storage_width <= 8) continue; - space_buffer = realloc(space_buffer, storage_width); - memset(space_buffer, ' ', storage_width); + space_buffer = realloc(space_buffer, user_width); + memset(space_buffer, ' ', user_width); retval = readstat_write_bytes(writer, &name_len, sizeof(int32_t)); if (retval != READSTAT_OK) @@ -969,7 +976,7 @@ static readstat_error_t sav_emit_long_string_value_labels_record(readstat_writer if (retval != READSTAT_OK) goto cleanup; - 
retval = readstat_write_bytes(writer, &storage_width, sizeof(int32_t)); + retval = readstat_write_bytes(writer, &user_width, sizeof(int32_t)); if (retval != READSTAT_OK) goto cleanup; @@ -984,7 +991,7 @@ static readstat_error_t sav_emit_long_string_value_labels_record(readstat_writer if (label_len > MAX_VALUE_LABEL_SIZE) label_len = MAX_VALUE_LABEL_SIZE; - retval = readstat_write_bytes(writer, &storage_width, sizeof(int32_t)); + retval = readstat_write_bytes(writer, &user_width, sizeof(int32_t)); if (retval != READSTAT_OK) goto cleanup; @@ -992,8 +999,8 @@ static readstat_error_t sav_emit_long_string_value_labels_record(readstat_writer if (retval != READSTAT_OK) goto cleanup; - if (value_len < storage_width) { - retval = readstat_write_bytes(writer, space_buffer, storage_width - value_len); + if (value_len < user_width) { + retval = readstat_write_bytes(writer, space_buffer, user_width - value_len); if (retval != READSTAT_OK) goto cleanup; }