Skip to content

Commit

Permalink
add one-pass parsing of ints/floats based on @c-blake's work
Browse files Browse the repository at this point in the history
This uses @c-blake's code from
nim-lang/Nim#16055
to avoid having to copy the string which we parse; instead it directly
parses a number into either an int or a float.

Note that this implementation has some edge cases that break it. In
particular, a number that starts valid but doesn't end as a valid
number is technically ok for the number parser, but will break our CSV
parser, since our position in the memfile will *not* be on the next separator!
  • Loading branch information
Vindaar committed Dec 30, 2020
1 parent e52c211 commit a1cdb80
Showing 1 changed file with 124 additions and 23 deletions.
147 changes: 124 additions & 23 deletions src/ggplotnim/ggplot_io.nim
Original file line number Diff line number Diff line change
Expand Up @@ -107,44 +107,140 @@ template guessType(data: ptr UncheckedArray[char], buf: var string,
else:
colTypes[col] = colString

proc i64(c: char): int {.inline.} =
  ## Maps the ASCII digit `c` to its numeric value by subtracting the
  ## character code of `'0'`. No check is made that `c` is a digit.
  result = ord(c) - ord('0')

proc pow10(e: int): float {.inline.} =
  ## Returns 10 raised to the power `e`.
  ##
  ## Exponents in `-22 .. 9` are served from a constant lookup table; all
  ## other exponents are computed by square-and-multiply, using `0.1` as
  ## the base for negative exponents and `10.0` otherwise.
  const smallPowers = [1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17, 1e-16, 1e-15,
                       1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-09, 1e-08, 1e-07,
                       1e-06, 1e-05, 1e-4, 1e-3, 1e-2, 1e-1, 1.0, 1e1, 1e2, 1e3,
                       1e4, 1e5, 1e6, 1e7, 1e8, 1e9] # 32 slots, index = e + 22
  if e >= -22 and e <= 9:
    return smallPowers[e + 22]         # common case: plain table lookup
  var
    factor = if e < 0: 0.1 else: 10.0  # value that is squared every round
    remaining = if e < 0: -e else: e   # magnitude of the exponent left to do
  result = 1.0
  while remaining != 0:
    if (remaining and 1) != 0:         # lowest bit set: fold in this power
      result *= factor
    remaining = remaining shr 1
    factor *= factor

type
  RetType = enum ## Classification returned by `parseNumber` for a scanned field.
    # rtInt:   parsed as an integer (value delivered in `intVal`)
    # rtFloat: parsed as a float (value delivered in `floatVal`)
    # rtError: the scanned text could not be turned into a number
    rtInt, rtFloat, rtError

proc parseNumber(data: ptr UncheckedArray[char],
                 idxIn: var int,
                 intVal: var int, floatVal: var float, strVal: var string): RetType {.inline, noInit.} =
  ## Parses, validates and classifies the number starting at `data[idxIn]` in
  ## a single pass. Taken and adapted from @c-blake's code in Nim PR #16055.
  ##
  ## Returns:
  ## * `rtInt`   - only an optional sign and digits were seen; the value is
  ##   in `intVal`.
  ## * `rtFloat` - a '.' and/or an exponent was seen; the value is in
  ##   `floatVal` (`intVal` then holds the sign-adjusted mantissa digits).
  ## * `rtError` - a '.' was seen but not a single digit; `idxIn` is left
  ##   untouched and `strVal` is not written.
  ##
  ## On `rtInt` / `rtFloat`, `idxIn` is advanced to the first character that
  ## was not consumed. Only the first 18 digits are accumulated into
  ## `intVal`; if there are more, the scanned text is additionally copied
  ## into `strVal` via `copyBuf`.
  ##
  ## NOTE: input with no digits and no '.' (e.g. an empty field) is reported
  ## as `rtInt` with `intVal == 0`, and for an `rtInt` with more than 18
  ## digits `intVal` only holds the leading 18 digits.
  ##
  ## TODO:
  ## This code has not been fully adapted yet. Instead of returning `rtError` under
  ## some circumstance, we have to copyMem into `buf`.
  ## We also need more error conditions. While this is only called if we *think*
  ## the data is a number, what happens to `1.23eOhNoNaN`?
  const Sign = {'+', '-'}         # NOTE: `parseFloat` can generalize this to INF/NAN.
  var idx = idxIn
  var noDot = false
  var exp = 0
  var p10 = 0
  var pnt = -1                    # find '.' (point); do digits
  var nD = 0                      # number of digits seen (the '.' is not counted)
  var giant = false
  intVal = 0                      # build intVal up from zero..
  if data[idx] in Sign:
    idx.inc                       # skip optional sign
  while data[idx] != '\0':        # ..and track scale/pow10.
    if data[idx] notin Digits:
      if data[idx] != '.' or pnt >= 0:
        break                     # a second '.' is forbidden
      pnt = nD                    # save location of '.' (point)
      nD.dec                      # undo loop's nD.inc
    elif nD < 18:                 # 2**63==9.2e18 => 18 digits ok
      intVal = 10 * intVal + data[idx].i64 # core ASCII->binary transform
    else:                         # more digits than fit into intVal
      giant = true                #XXX condition should be more precise than "18 digits"
      p10.inc                     # any digit moves implicit '.'
    idx.inc
    nD.inc
  if data[idxIn] == '-':
    intVal = -intVal              # adjust sign
  if pnt < 0:                     # never saw '.'
    pnt = nD; noDot = true        # so set to number of digits
  elif nD == 0:
    # ONLY "[+-]*\.*": `nD` counts digits only, so zero digits is the
    # dot-without-digits case. Inputs such as "1." or ".5" have nD == 1 and
    # are valid floats.
    return rtError
  if data[idx] in {'E', 'e'}:     # optional exponent
    idx.inc
    let i0 = idx
    if data[idx] in Sign:
      idx.inc                     # skip optional sign
    while data[idx] in Digits:    # build exponent
      exp = 10 * exp + data[idx].i64
      idx.inc
    if data[i0] == '-':
      exp = -exp                  # adjust sign
  elif noDot: # and intVal < (1'i64 shl 53'i64) ? # No '.' & No [Ee]xponent
    if giant:
      # copy while `idxIn` still marks the start of the number; updating
      # `idxIn` first would make the copied range empty
      copyBuf(data, strVal, idx, idxIn)
    idxIn = idx
    return rtInt                  # mark as integer
  exp += pnt - nD + p10           # combine explicit&implicit exp
  floatVal = intVal.float * pow10(exp) # has round-off vs. 80-bit
  if giant:
    copyBuf(data, strVal, idx, idxIn)
  idxIn = idx
  result = rtFloat                # mark as float

template parseCol(data: ptr UncheckedArray[char], buf: var string, col: var Column,
                  colTypes: seq[ColKind], colIdx, idx, colStart, row: int,
                  intVal: var int, floatVal: var float, rtType: var RetType): untyped =
  ## Parses the field delimited by `colStart` and `idx` in `data` and stores
  ## it in `col[row]` according to the current type of the column,
  ## `colTypes[colIdx]`.
  ##
  ## Numeric columns are parsed in place by `parseNumber` (no copy into
  ## `buf`); its classification is stored in `rtType` and the value arrives
  ## in `intVal` / `floatVal`. All other column kinds first copy the field
  ## into `buf` with `copyBuf`.
  ##
  ## If a value does not fit the current column type, the column is
  ## converted (int -> float, or anything -> object) and `colTypes[colIdx]`
  ## is updated accordingly.
  ##
  ## NOTE(review): after a successful `parseNumber`, `idx` is set to the
  ## position where number parsing stopped. If a field starts like a number
  ## but continues with other characters, that position is presumably not
  ## the next separator - confirm how the surrounding `parseLine` copes.
  case colTypes[colIdx]
  of colInt:
    rtType = parseNumber(data, colStart, intVal, floatVal, buf)
    case rtType
    of rtInt:
      idx = colStart
      col.iCol[row] = intVal
    of rtFloat:
      idx = colStart
      # first float seen in an integer column: convert the whole column to
      # float; this branch is only hit once per column
      col = toColumn col.iCol.asType(float)
      col.fCol[row] = floatVal
      colTypes[colIdx] = colFloat
    of rtError:
      # `parseNumber` neither advances `colStart` nor fills `buf` on error,
      # so copy the raw field here and leave `idx` where it was
      copyBuf(data, buf, idx, colStart)
      # object column
      col = toObjectColumn col
      colTypes[colIdx] = colObject
      col.oCol[row] = %~ buf
  of colFloat:
    rtType = parseNumber(data, colStart, intVal, floatVal, buf)
    case rtType
    of rtInt:
      idx = colStart
      col.fCol[row] = intVal.float
    of rtFloat:
      idx = colStart
      col.fCol[row] = floatVal
    of rtError:
      # see the `colInt` error branch: copy the raw field, keep `idx`
      copyBuf(data, buf, idx, colStart)
      # object column
      col = toObjectColumn col
      colTypes[colIdx] = colObject
      col.oCol[row] = %~ buf
  of colBool:
    copyBuf(data, buf, idx, colStart)
    try:
      col.bCol[row] = parseBool buf
    except ValueError:
      # object column
      col = toObjectColumn col
      colTypes[colIdx] = colObject
      col.oCol[row] = %~ buf
  of colString:
    copyBuf(data, buf, idx, colStart)
    col.sCol[row] = buf
  of colObject:
    copyBuf(data, buf, idx, colStart)
    col.oCol[row] = %~ buf
  of colConstant: discard # already set
  of colNone: doAssert false, "Invalid column to parse into: `colNone`"

Expand Down Expand Up @@ -212,9 +308,14 @@ proc readCsvTyped*(fname: string): DataFrame =
cols[i] = newColumn(colTypes[i], lineCnt - 1) # -1 because of header
# 4. parse the actual data
doAssert row >= 0, "Parsing the header failed"
var
retType: RetType
intVal: int
floatVal: float
while idx < ff.size:
parseLine(data, buf, col, idx, colStart, row, toBreak = false):
parseCol(data, buf, cols[col], colTypes, col, idx, colStart, row)
parseCol(data, buf, cols[col], colTypes, col, idx, colStart, row,
intVal, floatVal, retType)
for i, col in colNames:
result[col] = cols[i]
result.len = row
Expand Down

0 comments on commit a1cdb80

Please sign in to comment.