Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

See what happens if we don't track thrift i32 #925

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 18 additions & 21 deletions fastparquet/cencoding.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ cpdef uint64_t read_unsigned_var_int(NumpyIO file_obj):
cdef uint64_t result = 0
cdef int32_t shift = 0
cdef char byte
cdef char * inptr = file_obj.get_pointer()
cdef char * inptr = file_obj.ptr + file_obj.loc # file_obj.get_pointer()

while True:
byte = inptr[0]
Expand All @@ -185,7 +185,7 @@ cpdef uint64_t read_unsigned_var_int(NumpyIO file_obj):
if (byte & 0x80) == 0:
break
shift += 7
file_obj.loc += inptr - file_obj.get_pointer()
file_obj.loc += inptr - (file_obj.ptr + file_obj.loc)
return result


Expand Down Expand Up @@ -222,7 +222,9 @@ cdef void delta_read_bitpacked(NumpyIO file_obj, uint8_t bitwidth,
uint64_t mask = 0XFFFFFFFFFFFFFFFF >> (64 - bitwidth)
while count > 0:
if (left - right) < bitwidth:
data = data | (<uint64_t>file_obj.read_byte() << left)
# data = data | (<uint64_t>file_obj.read_byte() << left)
data = data | (<uint64_t>file_obj.ptr[file_obj.loc] << left)
file_obj.loc += 1
left += 8
elif right > 8:
data >>= 8
Expand Down Expand Up @@ -524,24 +526,19 @@ cpdef dict read_thrift(NumpyIO data):
cdef char byte, id = 0, bit
cdef int32_t size
cdef dict out = {}
cdef bint hasi64 = 0
cdef bint hasi32 = 0
cdef list i32 = None
while True:
byte = data.read_byte()
# byte = data.read_byte()
byte = data.ptr[data.loc]
data.loc += 1

if byte == 0:
break
id += (byte & 0b11110000) >> 4
bit = byte & 0b00001111
if bit == 5:
out[id] = zigzag_long(read_unsigned_var_int(data))
hasi32 = True
if i32 is None:
i32 = list()
i32.append(id)
elif bit == 6:
out[id] = zigzag_long(read_unsigned_var_int(data))
hasi64 = True
elif bit == 7:
out[id] = <double>data.get_pointer()[0]
data.seek(8, 1)
Expand All @@ -562,21 +559,21 @@ cpdef dict read_thrift(NumpyIO data):
out[id] = zigzag_long(read_unsigned_var_int(data))
elif bit == 3:
# I8
out[id] = data.read_byte()
# out[id] = data.read_byte()
out[id] = data.ptr[data.loc]
data.loc += 1
else:
print("Corrupted thrift data at ", data.tell(), ": ", id, bit)
if hasi32:
if hasi64:
out["i32list"] = i32
else:
out["i32"] = 1
return out


cdef list read_list(NumpyIO data):
cdef unsigned char byte, typ
cdef int32_t size, bsize, _
byte = data.read_byte()
# byte = data.read_byte()
byte = data.ptr[data.loc]
data.loc += 1

if byte >= 0xf0: # 0b11110000
size = read_unsigned_var_int(data)
else:
Expand All @@ -590,8 +587,8 @@ cdef list read_list(NumpyIO data):
for _ in range(size):
# all parquet list types contain str, not bytes
bsize = read_unsigned_var_int(data)
out.append(PyUnicode_DecodeUTF8(data.get_pointer(), bsize, "ignore"))
data.seek(bsize, 1)
out.append(PyUnicode_DecodeUTF8(data.ptr + data.loc, bsize, "ignore"))
data.loc += bsize
else:
for _ in range(size):
out.append(read_thrift(data))
Expand Down
Loading