Skip to content

Commit

Permalink
Merge pull request #78 from martindurant/merge_ind
Browse files Browse the repository at this point in the history
Alternative merge algorithm without the builder
  • Loading branch information
martindurant authored Sep 13, 2024
2 parents 321b6e4 + 7a6acba commit 0b533d8
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 31 deletions.
70 changes: 40 additions & 30 deletions src/akimbo/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import awkward as ak
import fsspec
import numpy as np


def ak_to_series(ds, backend="pandas", extract=True):
Expand Down Expand Up @@ -159,31 +160,6 @@ def get_avro_schema(
return form


def _merge(ind1, ind2, builder):
"""numba jittable left join/merge index finder"""
len2 = len(ind2)
j = 0
for i in ind1:
builder.begin_list()
while True:
if j >= len2:
break
if i > ind2[j]:
# ID not yet found
j += 1
continue
if i < ind2[j]:
# no more entrie
break
# hit
while True:
builder.append(j)
j += 1
if j >= len2 or i != ind2[j]:
break
builder.end_list()


# One-slot holder that ``join`` reads as ``_jitted[0]`` in place of the plain
# ``_merge``; it starts out as None.
# NOTE(review): presumably filled with a numba-compiled ``_merge`` on first
# use - confirm in ``join``.
_jitted = [None]


Expand Down Expand Up @@ -223,10 +199,44 @@ def join(
merge = _jitted[0]
else:
merge = _merge
builder = ak.ArrayBuilder()
merge(table1[key], table2[key], builder)
merge_index = builder.snapshot()
indexed = table2[ak.flatten(merge_index)]
counts = ak.num(merge_index)

counts = np.empty(len(table1), dtype="uint64")
# TODO: the line below over-allocates, can switch to something growable
matches = np.empty(len(table2), dtype="uint64")
# TODO: to_numpy(allow_missing) makes this a bit faster, but is
# not GPU general
counts, matches, ind = merge(table1[key], table2[key], counts, matches)
matches.resize(int(ind), refcheck=False)
indexed = table2[matches]
listy = ak.unflatten(indexed, counts)
return ak.with_field(table1, listy, colname)


def _merge(ind1, ind2, counts, matches):
len2 = len(ind2)
j = 0
offind = 0
matchind = 0
last = 0
for i in ind1:
while True:
if j >= len2:
break
if i > ind2[j]:
# ID not yet found
j += 1
continue
if i < ind2[j]:
# no more entrie
break
# hit
while True:
matches[matchind] = j
j += 1
matchind += 1
if j >= len2 or i != ind2[j]:
break
counts[offind] = matchind - last
last = matchind
offind += 1
return counts, matches, matchind
4 changes: 3 additions & 1 deletion src/akimbo/mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,9 @@ def to_arrow(cls, data):
@property
def array(self) -> ak.Array:
    """Data as an awkward array

    The arrow data is converted with ``ak.from_arrow``. The result is passed
    through ``ak.with_name`` only when ``self._behavior`` is truthy;
    otherwise the converted array is returned as is.
    """
    converted = ak.from_arrow(self.arrow)
    if self._behavior:
        return ak.with_name(converted, self._behavior)
    return converted

@classmethod
def register_accessor(cls, name, klass):
Expand Down

0 comments on commit 0b533d8

Please sign in to comment.