Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add join #31

Merged
merged 6 commits into from
Jun 28, 2014
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions cytoolz/itertoolz.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,39 @@ cdef class _pluck_list_default:


# Declaration of ``pluck`` for C-level callers; ``default=*`` marks an optional
# argument whose actual default value is given in the .pyx implementation.
cpdef object pluck(object ind, object seqs, object default=*)

# Declaration of ``join``; both ``*_default`` arguments are optional and their
# default values are supplied by the implementation in the .pyx file.
cpdef object join(object leftkey, object leftseq,
object rightkey, object rightseq,
object left_default=*,
object right_default=*)

# Attribute layout of the iterator class that implements ``join``.
cdef class _join:
# NOTE(review): n, iterseq, leftseq, key and d_items are declared here but are
# not assigned by the visible .pyx implementation -- confirm before removing.
cdef Py_ssize_t n
cdef object iterseq
# Key function applied to left elements (used to build ``d``).
cdef object leftkey
cdef object leftseq
# Key function applied to each right element.
cdef object rightkey
# Iterator over the right sequence, consumed lazily.
cdef object rightseq
# Left elements matching the current right element (or the current left group).
cdef object matches
# Most recently fetched right element.
cdef object right
cdef object key
# Result of grouping the left sequence by ``leftkey``.
cdef object d
cdef object d_items
# Set of right-side keys encountered so far.
cdef object seen_keys
# Only ever assigned True/False and truth-tested, so a C boolean is sufficient
# and avoids Python object overhead; ``bint`` converts to/from Python bools.
cdef bint is_rightseq_exhausted
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use cdef bint for a fast C boolean type that is compatible with Python bools.

# Value paired with a right element that has no matching left element.
cdef object left_default
# Value paired with a left element whose key never appeared on the right.
cdef object right_default
# Position of the next element to emit from ``matches``.
cdef int i
# Iterator over the keys of ``d``, created once the right sequence is exhausted.
cdef object keys

# Subclass of ``_join`` that declares no additional C attributes.
cdef class _inner_join(_join):
pass

# Subclass of ``_join`` that declares no additional C attributes.
cdef class _right_outer_join(_join):
pass

# Subclass of ``_join`` that declares no additional C attributes.
cdef class _left_outer_join(_join):
pass

# Subclass of ``_join`` that declares no additional C attributes.
cdef class _outer_join(_join):
pass
255 changes: 255 additions & 0 deletions cytoolz/itertoolz.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1061,6 +1061,261 @@ cpdef object pluck(object ind, object seqs, object default=no_default):
return _pluck_index_default(ind, seqs, default)


def getter(index):
    """Build a key function that indexes its argument with ``index``.

    A non-list ``index`` produces a function returning the single item
    ``x[index]``.  A list ``index`` always produces a function returning a
    tuple, one entry per listed index; this includes the one-element list
    (a 1-tuple) and the empty list (the empty tuple).

    >>> getter(1)(['a', 'b'])
    'b'
    >>> getter([1])(['a', 'b'])
    ('b',)
    >>> getter([0, 1])(['a', 'b', 'c'])
    ('a', 'b')
    >>> getter([])(['a', 'b'])
    ()
    """
    if not isinstance(index, list):
        return itemgetter(index)
    if not index:
        # itemgetter() with no arguments raises TypeError, so the empty
        # selection is handled explicitly.
        return lambda x: ()
    if len(index) == 1:
        # itemgetter with one argument returns a bare item, not a tuple, so
        # wrap the single lookup to keep the list-index contract.
        index = index[0]
        return lambda x: (x[index],)
    return itemgetter(*index)

cpdef object join(object leftkey, object leftseq,
                  object rightkey, object rightseq,
                  object left_default=no_default,
                  object right_default=no_default):
    """ Join two sequences on common attributes

    This is a semi-streaming operation.  The LEFT sequence is fully evaluated
    and placed into memory.  The RIGHT sequence is evaluated lazily and so can
    be arbitrarily large.

    >>> friends = [('Alice', 'Edith'),
    ...            ('Alice', 'Zhao'),
    ...            ('Edith', 'Alice'),
    ...            ('Zhao', 'Alice'),
    ...            ('Zhao', 'Edith')]

    >>> cities = [('Alice', 'NYC'),
    ...           ('Alice', 'Chicago'),
    ...           ('Dan', 'Sydney'),
    ...           ('Edith', 'Paris'),
    ...           ('Edith', 'Berlin'),
    ...           ('Zhao', 'Shanghai')]

    >>> # Vacation opportunities
    >>> # In what cities do people have friends?
    >>> result = join(second, friends,
    ...               first, cities)
    >>> for ((a, b), (c, d)) in sorted(unique(result)):
    ...     print((a, d))
    ('Alice', 'Berlin')
    ('Alice', 'Paris')
    ('Alice', 'Shanghai')
    ('Edith', 'Chicago')
    ('Edith', 'NYC')
    ('Zhao', 'Chicago')
    ('Zhao', 'NYC')
    ('Zhao', 'Berlin')
    ('Zhao', 'Paris')

    Specify outer joins with keyword arguments ``left_default`` and/or
    ``right_default``.  Here is a full outer join in which unmatched elements
    are paired with None.

    >>> identity = lambda x: x
    >>> list(join(identity, [1, 2, 3],
    ...           identity, [2, 3, 4],
    ...           left_default=None, right_default=None))
    [(2, 2), (3, 3), (None, 4), (1, None)]

    Usually the key arguments are callables to be applied to the sequences.  If
    the keys are not obviously callable then it is assumed that indexing was
    intended, e.g. the following is a legal change

    >>> # result = join(second, friends, first, cities)
    >>> result = join(1, friends, 0, cities)  # doctest: +SKIP

    Returns an iterator of ``(left_item, right_item)`` pairs.
    """
    # Pick the iterator specialised for the requested kind of join.  The
    # sentinel is compared by identity, as the iterator classes do, so that
    # defaults with unusual ``==`` behaviour (e.g. arrays) are handled safely.
    if left_default is no_default:
        if right_default is no_default:
            return _inner_join(leftkey, leftseq, rightkey, rightseq,
                               left_default, right_default)
        return _left_outer_join(leftkey, leftseq, rightkey, rightseq,
                                left_default, right_default)
    if right_default is no_default:
        return _right_outer_join(leftkey, leftseq, rightkey, rightseq,
                                 left_default, right_default)
    return _outer_join(leftkey, leftseq, rightkey, rightseq,
                       left_default, right_default)

cdef class _join:
    """ General-purpose iterator behind ``join``

    The left sequence is grouped by ``leftkey`` into a dict up front; the
    right sequence is then consumed one element at a time.  Which unmatched
    elements are emitted depends on whether ``left_default`` and/or
    ``right_default`` differ from ``no_default``.
    """
    def __init__(self,
                 object leftkey, object leftseq,
                 object rightkey, object rightseq,
                 object left_default=no_default,
                 object right_default=no_default):
        # Non-callable keys are interpreted as indices into each element.
        if not callable(leftkey):
            leftkey = getter(leftkey)
        if not callable(rightkey):
            rightkey = getter(rightkey)

        self.left_default = left_default
        self.right_default = right_default

        self.leftkey = leftkey
        self.rightkey = rightkey
        self.rightseq = iter(rightseq)

        # Left side is materialised here: key -> list of left elements.
        self.d = groupby(leftkey, leftseq)
        self.seen_keys = set()
        # Empty ``matches`` together with ``i == 0`` makes the first call to
        # ``__next__`` fetch a right element immediately.
        self.matches = ()
        self.i = 0
        self.right = None

        self.is_rightseq_exhausted = False

    def __iter__(self):
        return self

    def __next__(self):
        cdef PyObject *obj = NULL
        if not self.is_rightseq_exhausted:
            if self.i == len(self.matches):
                # Every match for the previous right element has been emitted.
                # Advance with a loop (not recursion) so that long runs of
                # unmatched right elements cannot exhaust the stack.
                while obj is NULL:
                    try:
                        self.right = next(self.rightseq)
                    except StopIteration:
                        if self.right_default is no_default:
                            # No left-only rows requested: iteration is over.
                            raise
                        # Switch to emitting left groups never seen on the right.
                        self.is_rightseq_exhausted = True
                        self.keys = iter(self.d)
                        return next(self)
                    key = self.rightkey(self.right)
                    self.seen_keys.add(key)
                    # NULL means no left group exists for this key.
                    obj = PyDict_GetItem(self.d, key)
                    if obj is NULL and self.left_default is not no_default:
                        return (self.left_default, self.right)
                self.matches = <object>obj
                self.i = 0
            match = <object>PyList_GET_ITEM(self.matches, self.i)  # skip error checking
            self.i += 1
            return (match, self.right)

        elif self.right_default is not no_default:
            if self.i == len(self.matches):
                # Find the next left group whose key never appeared on the
                # right; StopIteration from ``self.keys`` ends the join.
                key = next(self.keys)
                while key in self.seen_keys:
                    key = next(self.keys)
                obj = PyDict_GetItem(self.d, key)
                self.matches = <object>obj
                self.i = 0
            match = <object>PyList_GET_ITEM(self.matches, self.i)  # skip error checking
            self.i += 1
            return (match, self.right_default)


# Iterator that emits every right element: once per matching left element, or
# once paired with ``left_default`` when its key has no left group.  In
# ``join``'s dispatch this class is paired with "left_default given,
# right_default not given".
cdef class _right_outer_join(_join):
def __next__(self):
cdef PyObject *obj
if self.i == len(self.matches):
# All matches for the previous right element were emitted; fetch the next
# one.  StopIteration from ``next`` propagates and ends the iteration.
self.right = next(self.rightseq)
key = self.rightkey(self.right)
obj = PyDict_GetItem(self.d, key)
if obj is NULL:
# No left group for this key: pair the right element with the left default.
# ``matches``/``i`` are untouched, so the next call fetches a new right element.
return (self.left_default, self.right)
self.matches = <object>obj
self.i = 0
match = <object>PyList_GET_ITEM(self.matches, self.i) # skip error checking
self.i += 1
return (match, self.right)


# Iterator working in two phases: it first streams the right sequence
# (emitting matches, or ``left_default`` pairs for unmatched right elements),
# then emits left elements whose keys never appeared on the right, paired with
# ``right_default``.  In ``join``'s dispatch this class is the "both defaults
# given" case.
cdef class _outer_join(_join):
def __next__(self):
cdef PyObject *obj
if not self.is_rightseq_exhausted:
# Phase 1: the right sequence still has (or may have) elements.
if self.i == len(self.matches):
# All matches for the previous right element were emitted.
try:
self.right = next(self.rightseq)
except StopIteration:
# Right side finished: switch to phase 2 and re-enter to produce its first item.
self.is_rightseq_exhausted = True
self.keys = iter(self.d)
return next(self)
key = self.rightkey(self.right)
# Record the key so phase 2 can skip left groups that were already paired.
self.seen_keys.add(key)
obj = PyDict_GetItem(self.d, key)
if obj is NULL:
# No left group for this key: pair the right element with the left default.
return (self.left_default, self.right)
self.matches = <object>obj
self.i = 0
match = <object>PyList_GET_ITEM(self.matches, self.i) # skip error checking
self.i += 1
return (match, self.right)

else:
# Phase 2: walk the keys of the grouped left dict.
if self.i == len(self.matches):
# Advance to the next key not seen on the right; StopIteration from
# ``self.keys`` propagates and ends the iteration.
key = next(self.keys)
while key in self.seen_keys:
key = next(self.keys)
obj = PyDict_GetItem(self.d, key)
self.matches = <object>obj
self.i = 0
match = <object>PyList_GET_ITEM(self.matches, self.i) # skip error checking
self.i += 1
return (match, self.right_default)



# Iterator working in two phases: it first streams the right sequence and emits
# only matched pairs (unmatched right elements are skipped), then emits left
# elements whose keys never appeared on the right, paired with
# ``right_default``.  In ``join``'s dispatch this class is paired with
# "right_default given, left_default not given".
cdef class _left_outer_join(_join):
def __next__(self):
cdef PyObject *obj
if not self.is_rightseq_exhausted:
# Phase 1: the right sequence still has (or may have) elements.
if self.i == len(self.matches):
# Keep pulling right elements until one has a left group (obj is not NULL).
obj = NULL
while obj is NULL:
try:
self.right = next(self.rightseq)
except StopIteration:
# Right side finished: switch to phase 2 and re-enter to produce its first item.
self.is_rightseq_exhausted = True
self.keys = iter(self.d)
return next(self)
key = self.rightkey(self.right)
# Record the key so phase 2 can skip left groups that were already paired.
self.seen_keys.add(key)
obj = PyDict_GetItem(self.d, key)
self.matches = <object>obj
self.i = 0
match = <object>PyList_GET_ITEM(self.matches, self.i) # skip error checking
self.i += 1
return (match, self.right)

else:
# Phase 2: walk the keys of the grouped left dict.
if self.i == len(self.matches):
# Advance to the next key not seen on the right; StopIteration from
# ``self.keys`` propagates and ends the iteration.
key = next(self.keys)
while key in self.seen_keys:
key = next(self.keys)
obj = PyDict_GetItem(self.d, key)
self.matches = <object>obj
self.i = 0
match = <object>PyList_GET_ITEM(self.matches, self.i) # skip error checking
self.i += 1
return (match, self.right_default)


# Iterator that emits only pairs whose keys occur on both sides.
cdef class _inner_join(_join):
# ``__iter__`` must not touch iteration state: clearing ``matches`` here
# without also resetting ``i`` would drop pending matches and could index past
# the end of the sequence when ``iter()`` is called on a partly consumed join.
def __iter__(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this necessary?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't. Removed. It was old cruft that hung around.

# The join object is its own iterator.
return self

# Return the next (left, right) pair whose keys match; right elements without a
# left group are skipped.  StopIteration from the right iterator ends the join.
def __next__(self):
cdef PyObject *obj = NULL
if self.i == len(self.matches):
# All matches for the previous right element were emitted: pull right
# elements until one has a left group (obj is not NULL).
while obj is NULL:
self.right = next(self.rightseq)
key = self.rightkey(self.right)
obj = PyDict_GetItem(self.d, key)
self.matches = <object>obj
self.i = 0
match = <object>PyList_GET_ITEM(self.matches, self.i) # skip error checking
self.i += 1
return (match, self.right)


# I find `_consume` convenient for benchmarking. Perhaps this belongs
# elsewhere, so it is private (leading underscore) and hidden away for now.

Expand Down
Loading