Skip to content

Support variable length whitespace delimiter in fread, as read.table does #785

@renqian

Description

@renqian

I have hundreds of txt delim tables like this:

https://gist.github.com/renqian/d81cb48c686cf5376800

However, while read.table properly handles reading the content with sep="\t", data.table::fread produces a data.table with wrongly-typed columns, some of them coerced to character due to white-space characters 0x20 while they truly are numeric or integer when such white-spaces are properly trimmed.

> df <- read.table("Z:/IF_TICK/IFL1/20100416-20100630.txt",header = T,sep = "\t")
> colnames(df)
 [1] "IFLxID"         "IFLxName"       "Ifcd"           "Tdate"          "Ttime"          "UpdateMillisec" "Cp"            
 [8] "Chg"            "ChgPct"         "Cq"             "Cm"             "Oc"             "S5"             "S4"            
[15] "S3"             "S2"             "S1"             "B1"             "B2"             "B3"             "B4"            
[22] "B5"             "Sv5"            "Sv4"            "Sv3"            "Sv2"            "Sv1"            "Bv1"           
[29] "Bv2"            "Bv3"            "Bv4"            "Bv5"            "BS"             "Bsratio"        "PreClosePrc"   
[36] "OpenPrc"        "Hp"             "Lp"             "ClosePrc"       "UpperLmtPrc"    "LowerLmtPrc"    "Tq"            
[43] "Tm"             "PreOpnIntrst"   "OpnIntrst"      "PreStlmtPrc"    "StlmtPrc"       "PreDelta"       "Delta"         
[50] "SettleGroupID"  "SettleID"      
> sapply(df,class)
        IFLxID       IFLxName           Ifcd          Tdate          Ttime UpdateMillisec             Cp            Chg 
      "factor"       "factor"       "factor"      "integer"      "integer"      "numeric"      "numeric"      "numeric" 
        ChgPct             Cq             Cm             Oc             S5             S4             S3             S2 
     "numeric"      "numeric"      "numeric"       "factor"      "numeric"      "numeric"      "numeric"      "numeric" 
            S1             B1             B2             B3             B4             B5            Sv5            Sv4 
     "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric" 
           Sv3            Sv2            Sv1            Bv1            Bv2            Bv3            Bv4            Bv5 
     "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric" 
            BS        Bsratio    PreClosePrc        OpenPrc             Hp             Lp       ClosePrc    UpperLmtPrc 
      "factor"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric" 
   LowerLmtPrc             Tq             Tm   PreOpnIntrst      OpnIntrst    PreStlmtPrc       StlmtPrc       PreDelta 
     "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric"      "numeric" 
         Delta  SettleGroupID       SettleID 
     "numeric"      "logical"      "numeric" 
> df$UpdateMillisec[1]
[1] 500

The results produced by data.table::fread is as follows:

> dt <- data.table::fread("Z:/IF_TICK/IFL1/20100416-20100630.txt",header = T)
Read 100000 rows and 51 (of 51) columns from 0.041 GB file in 00:00:10
> colnames(dt)
 [1] "IFLxID"          "IFLxName"        "Ifcd  "          "Tdate   "        "Ttime "          "UpdateMillisec" 
 [7] "Cp     "         "Chg    "         "ChgPct"          "Cq "             "Cm           "   "Oc  "           
[13] "S5     "         "S4     "         "S3     "         "S2     "         "S1     "         "B1     "        
[19] "B2     "         "B3     "         "B4     "         "B5     "         "Sv5   "          "Sv4   "         
[25] "Sv3   "          "Sv2   "          "Sv1   "          "Bv1   "          "Bv2   "          "Bv3   "         
[31] "Bv4   "          "Bv5   "          "BS"              "Bsratio"         "PreClosePrc"     "OpenPrc"        
[37] "Hp     "         "Lp     "         "ClosePrc"        "UpperLmtPrc"     "LowerLmtPrc"     "Tq    "         
[43] "Tm             " "PreOpnIntrst"    "OpnIntrst"       "PreStlmtPrc"     "StlmtPrc"        "PreDelta"       
[49] "Delta"           "SettleGroupID"   "SettleID"       
> sapply(dt,class)
         IFLxID        IFLxName          Ifcd          Tdate             Ttime   UpdateMillisec         Cp      
    "character"     "character"     "character"       "integer"       "integer"     "character"       "numeric" 
        Chg              ChgPct             Cq    Cm                       Oc           S5              S4      
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
        S3              S2              S1              B1              B2              B3              B4      
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
        B5               Sv5             Sv4             Sv3             Sv2             Sv1             Bv1    
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
         Bv2             Bv3             Bv4             Bv5                 BS         Bsratio     PreClosePrc 
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
        OpenPrc         Hp              Lp             ClosePrc     UpperLmtPrc     LowerLmtPrc          Tq     
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
Tm                 PreOpnIntrst       OpnIntrst     PreStlmtPrc        StlmtPrc        PreDelta           Delta 
    "character"     "character"     "character"     "character"     "character"     "character"       "numeric" 
  SettleGroupID        SettleID 
      "integer"     "character" 
> dt$UpdateMillisec[1]
[1] "500           "

Even when sep="\t" is explicitly specified, things remain the same:

> dt <- data.table::fread("Z:/IF_TICK/IFL1/20100416-20100630.txt",header = T,sep = "\t")
Read 100000 rows and 51 (of 51) columns from 0.041 GB file in 00:00:09
> colnames(dt)
 [1] "IFLxID"          "IFLxName"        "Ifcd  "          "Tdate   "        "Ttime "          "UpdateMillisec" 
 [7] "Cp     "         "Chg    "         "ChgPct"          "Cq "             "Cm           "   "Oc  "           
[13] "S5     "         "S4     "         "S3     "         "S2     "         "S1     "         "B1     "        
[19] "B2     "         "B3     "         "B4     "         "B5     "         "Sv5   "          "Sv4   "         
[25] "Sv3   "          "Sv2   "          "Sv1   "          "Bv1   "          "Bv2   "          "Bv3   "         
[31] "Bv4   "          "Bv5   "          "BS"              "Bsratio"         "PreClosePrc"     "OpenPrc"        
[37] "Hp     "         "Lp     "         "ClosePrc"        "UpperLmtPrc"     "LowerLmtPrc"     "Tq    "         
[43] "Tm             " "PreOpnIntrst"    "OpnIntrst"       "PreStlmtPrc"     "StlmtPrc"        "PreDelta"       
[49] "Delta"           "SettleGroupID"   "SettleID"       
> sapply(dt,class)
         IFLxID        IFLxName          Ifcd          Tdate             Ttime   UpdateMillisec         Cp      
    "character"     "character"     "character"       "integer"       "integer"     "character"       "numeric" 
        Chg              ChgPct             Cq    Cm                       Oc           S5              S4      
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
        S3              S2              S1              B1              B2              B3              B4      
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
        B5               Sv5             Sv4             Sv3             Sv2             Sv1             Bv1    
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
         Bv2             Bv3             Bv4             Bv5                 BS         Bsratio     PreClosePrc 
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
        OpenPrc         Hp              Lp             ClosePrc     UpperLmtPrc     LowerLmtPrc          Tq     
    "character"     "character"     "character"     "character"     "character"     "character"     "character" 
Tm                 PreOpnIntrst       OpnIntrst     PreStlmtPrc        StlmtPrc        PreDelta           Delta 
    "character"     "character"     "character"     "character"     "character"     "character"       "numeric" 
  SettleGroupID        SettleID 
      "integer"     "character" 
> dt$UpdateMillisec[1]
[1] "500           "

The white-spaces are represented by 20 in hex raw:

> x <- dt$UpdateMillisec[1]
> x
[1] "500           "
> charToRaw(x)
 [1] 35 30 30 20 20 20 20 20 20 20 20 20 20 20

Metadata

Metadata

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions