Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T

27. `dogroups()` no longer reads beyond the resized end of over-allocated data.table list columns, [#7486](https://github.com/Rdatatable/data.table/issues/7486). While this didn't crash in practice, it is now explicitly checked for in recent R versions (r89198+). Thanks @TimTaylor and @aitap for the report and @aitap for the fix.

28. `fread()` with `col.names=` now correctly applies `drop=` and `select=` using user-provided column names instead of auto-generated names, [#3459](https://github.com/Rdatatable/data.table/issues/3459). Thanks to @malcook for the report and @ben-schwen for the fix.

### NOTES

1. The following in-progress deprecations have proceeded:
Expand Down
6 changes: 2 additions & 4 deletions R/fread.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ fread = function(
input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto",
na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE),
skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"),
col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL,
col.names=NULL, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL,
showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE),
nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE),
logicalYN=getOption("datatable.logicalYN", FALSE),
Expand Down Expand Up @@ -293,7 +293,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
tz="UTC"
}
ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,comment.char,
fill,showProgress,nThread,verbose,warnings2errors,logical01,logicalYN,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC")
fill,showProgress,nThread,verbose,warnings2errors,logical01,logicalYN,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC",col.names)
if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns
nr = length(ans[[1L]])
require_bit64_if_needed(ans)
Expand Down Expand Up @@ -356,8 +356,6 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
for (j in cols_to_factor) set(ans, j=j, value=as_factor(.subset2(ans, j)))
}

if (!missing(col.names)) # FR #768
setnames(ans, col.names) # setnames checks and errors automatically
if (!is.null(key) && data.table) {
if (!is.character(key))
stopf("key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)")
Expand Down
12 changes: 11 additions & 1 deletion inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -8072,7 +8072,7 @@ test(1557.1, names(fread(str)), c("V1", "V2")) # autonamed
test(1557.2, names(fread(str, col.names=letters[1:2])), letters[1:2])
test(1557.3, names(fread(str, col.names=letters[1])), error="Can't assign 1 names to")
test(1557.4, names(fread(str, col.names=letters[1:3])), error="Can't assign 3 names to")
test(1557.5, names(fread(str, col.names=1:2)), error="Passed a vector of type")
test(1557.5, names(fread(str, col.names=1:2)), error="'col.names' must be")

# Fix for #773
f = testDir("issue_773_fread.txt")
Expand Down Expand Up @@ -21901,3 +21901,13 @@ rm(DT, strings)
DT = unserialize(serialize(as.data.table(mtcars), NULL))
test(2351, DT[,carb:=NULL], as.data.table(mtcars)[,carb:=NULL])
rm(DT)

# drop works with user override colnames #3459
DT = data.table(a=c(1L, 4L), c=c(3L, 6L))
test(2352.1, fread("a,b,c\n1,2,3\n4,5,6", drop='b'), DT)
test(2352.2, fread(col.names=c("a","b","c"),"1,2,3\n4,5,6", drop='b'), DT)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what happens here?

fread(text = "
a,b,c
1,2,3
4,5,6
", col.names=c("b", "c", "d"), drop='b')

Copy link
Copy Markdown
Member Author

@ben-schwen ben-schwen Dec 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Questionable! But current implementation favors header in read data > renaming col.names header.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think either can be justified; I might find it more natural to prefer col.names.

either way we'll want to document it in ?fread and with a regression test.

cc @Rdatatable/committers to get consensus on the preferred approach.

test(2352.3, fread(col.names=c("a","b","c"),"1,2,3\n4,5,6", drop=2), DT)
test(2352.4, fread(col.names=c("a","b","c"),"1,2,3\n4,5,6", colClasses=c(b="NULL")), DT)
test(2352.5, fread(col.names=c("a","b","c"),"1,2,3\n4,5,6", select=c('a','c')), DT)
test(2352.6, fread(col.names=c("a","b","c"),"1,2,3\n4,5,6", select=c(1,3)), DT)
rm(DT)
2 changes: 1 addition & 1 deletion src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ SEXP setcharvec(SEXP, SEXP, SEXP);
SEXP chmatch_R(SEXP, SEXP, SEXP);
SEXP chmatchdup_R(SEXP, SEXP, SEXP);
SEXP chin_R(SEXP, SEXP);
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP setlistelt(SEXP, SEXP, SEXP);
Expand Down
40 changes: 34 additions & 6 deletions src/freadR.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ static colType readInt64As = CT_INT64;
static SEXP selectSxp;
static SEXP dropSxp;
static SEXP colClassesSxp;
static SEXP cNamesSxp;
static bool selectColClasses = false;
cetype_t ienc = CE_NATIVE;
static SEXP RCHK;
Expand Down Expand Up @@ -77,7 +78,8 @@ SEXP freadR(
SEXP integer64Arg,
SEXP encodingArg,
SEXP keepLeadingZerosArgs,
SEXP noTZasUTC
SEXP noTZasUTC,
SEXP colNamesArg
)
{
verbose = LOGICAL(verboseArg)[0];
Expand Down Expand Up @@ -185,6 +187,8 @@ SEXP freadR(
args.readInt64As = readInt64As;

colClassesSxp = colClassesArg;
if (!isNull(colNamesArg) && !isString(colNamesArg)) error(_("'col.names' must be NULL or a character vector"));
cNamesSxp = colNamesArg;

selectSxp = selectArg;
dropSxp = dropArg;
Expand Down Expand Up @@ -231,10 +235,10 @@ SEXP freadR(
return DT;
}

static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource)
static void applyDrop(SEXP items, int8_t *type, int ncol, int dropSource, SEXP matchNames)
{
if (!length(items)) return;
const SEXP itemsInt = PROTECT(isString(items) ? chmatch(items, colNamesSxp, NA_INTEGER) : coerceVector(items, INTSXP));
const SEXP itemsInt = PROTECT(isString(items) ? chmatch(items, matchNames, NA_INTEGER) : coerceVector(items, INTSXP));
const int* const itemsD = INTEGER(itemsInt);
const int n = LENGTH(itemsInt);
for (int j = 0; j < n; j++) {
Expand Down Expand Up @@ -277,13 +281,15 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
}
}
// "use either select= or drop= but not both" was checked earlier in freadR
applyDrop(dropSxp, type, ncol, /*dropSource=*/-1);
bool hasHeader = ncol > 0 && strcmp(CHAR(STRING_ELT(colNamesSxp, 0)), "V1") != 0;
SEXP matchNames = (isNull(cNamesSxp) || hasHeader) ? colNamesSxp : cNamesSxp;
applyDrop(dropSxp, type, ncol, /*dropSource=*/-1, matchNames);
if (TYPEOF(colClassesSxp) == VECSXP) { // not isNewList() because that returns true for NULL
SEXP listNames = PROTECT(getAttrib(colClassesSxp, R_NamesSymbol)); // rchk wanted this protected
for (int i = 0; i < LENGTH(colClassesSxp); i++) {
if (STRING_ELT(listNames, i) == char_NULL) {
SEXP items = VECTOR_ELT(colClassesSxp, i);
applyDrop(items, type, ncol, /*dropSource=*/i);
applyDrop(items, type, ncol, /*dropSource=*/i, matchNames);
}
}
UNPROTECT(1); // listNames
Expand All @@ -294,7 +300,7 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
if (length(selectSxp)) {
const int n = length(selectSxp);
if (isString(selectSxp)) {
selectInts = INTEGER(PROTECT(chmatch(selectSxp, colNamesSxp, NA_INTEGER))); nprotect++;
selectInts = INTEGER(PROTECT(chmatch(selectSxp, matchNames, NA_INTEGER))); nprotect++;
for (int i = 0; i < n; i++) if (selectInts[i] == NA_INTEGER)
DTWARN(_("Column name '%s' not found in column name header (case sensitive), skipping."), CHAR(STRING_ELT(selectSxp, i)));
} else {
Expand All @@ -318,6 +324,28 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int
else type[i] = CT_DROP;
}
}
// override col names if user provided them
if (!isNull(cNamesSxp)) {
int ncnames = LENGTH(cNamesSxp);
if (hasHeader) {
int ndrop = 0;
for (int i = 0; i < ncol; i++) if (type[i] == CT_DROP) ndrop++;
if (ncnames != (ncol - ndrop)) STOP(_("Can't assign %d names to a %d-column data.table"), ncnames, ncol - ndrop);
int *selectRankD = selectRank ? INTEGER(selectRank) : NULL;
int cname_idx = 0;
for (int i = 0; i < ncol; i++) {
if (type[i] != CT_DROP) {
const int idx = selectRankD ? (selectRankD[i] - 1) : cname_idx++;
SET_STRING_ELT(colNamesSxp, i, STRING_ELT(cNamesSxp, idx));
}
}
} else {
if (ncnames != ncol) STOP(_("Can't assign %d names to a %d-column data.table"), ncnames, ncol);
for (int i = 0; i < ncol; i++) {
SET_STRING_ELT(colNamesSxp, i, STRING_ELT(cNamesSxp, i));
}
}
}
colClassesAs = NULL; // any coercions we can't handle here in C are deferred to R (to handle with methods::as) via this attribute
if (length(colClassesSxp)) {
SEXP typeRName_sxp = PROTECT(allocVector(STRSXP, NUT));
Expand Down
Loading