Merge pull request #334 from mdekstrand/feature/rbp
Add RBP top-N metric
mdekstrand authored Nov 3, 2023
2 parents c6299fa + f2205fc commit 3179988
Showing 2 changed files with 242 additions and 1 deletion.
101 changes: 100 additions & 1 deletion lenskit/metrics/topn.py
@@ -3,6 +3,7 @@
"""

import logging
import warnings
import numpy as np
import pandas as pd

@@ -269,7 +270,7 @@ def dcg(recs, truth, discount=np.log2):

def ndcg(recs, truth, discount=np.log2, k=None):
    """
-    Compute the normalized discounted cumulative gain :cite:p:`Jarvelin2002-xf`.
+    Compute the normalized discounted cumulative gain :cite:p:`ndcg`.

    Discounted cumulative gain is computed as:
@@ -357,3 +358,101 @@ def _bulk_ndcg(recs, truth, discount=np.log2, k=None):
    dcg['ndcg'] = dcg['dcg'].fillna(0) / dcg['ideal']

    return dcg['ndcg']


def rbp(recs, truth, k=None, patience=0.5, normalize=False):
    """
    Evaluate recommendations with rank-biased precision :cite:p:`rbp` with a
    patience parameter :math:`\\gamma`.

    If :math:`r_{ui} \\in \\{0, 1\\}` is the binary relevance of item :math:`i`
    for user :math:`u`, this is computed by:

    .. math::
        \\begin{align*}
        \\operatorname{RBP}_\\gamma(L, u) & = (1 - \\gamma) \\sum_i r_{ui} \\gamma^{i-1}
        \\end{align*}

    The original RBP metric depends on the idea that the rank-biased sum of
    binary relevance scores in an infinitely-long, perfectly-precise list is
    :math:`1/(1 - \\gamma)`. However, in recommender evaluation, we usually have
    a small test set, so the maximum achievable RBP is significantly less, and
    is a function of the number of test items. With ``normalize=True``, the RBP
    metric will be normalized by the maximum achievable with the provided test
    data.

    Parameters:
        recs: the recommendation list.
        truth: the user's truth data.
        k(int): the maximum recommendation list length.
        patience(float): the patience parameter :math:`\\gamma`, the probability
            that the user continues browsing at each point.
        normalize(bool): whether to normalize the RBP scores; if ``True``,
            divides the RBP score by the maximum achievable with the test data.
    """
    if k is not None and k <= len(recs):
        recs = recs.iloc[:k]
    else:
        k = len(recs)

    if 'rank' not in recs.columns:
        recs = recs.assign(rank=np.arange(1, len(recs) + 1))

    if np.min(recs['rank']) != 1:
        warnings.warn('rank should start with 1')

    nrel = len(truth)
    if nrel == 0:
        return None

    good = recs['item'].isin(truth.index)
    ranks = recs['rank'][good]
    disc = patience ** (ranks - 1)
    rbp = np.sum(disc)
    if normalize:
        # normalize by the maximum RBP achievable with this test set
        max_rbp = np.sum(patience ** np.arange(min(nrel, k)))
        # _log.info('rbp=%e, nrel=%d, eff=%d, max=%e', rbp, nrel, min(nrel, k), max_rbp)
        return rbp / max_rbp
    else:
        # standard RBP normalization
        return rbp * (1 - patience)


@bulk_impl(rbp)
def _bulk_rbp(recs, truth, k=None, patience=0.5, normalize=False):
    if k is not None:
        recs = recs[recs['rank'] <= k]

    good = recs.join(truth, on=['LKTruthID', 'item'], how='inner')
    good['rbp_disc'] = patience ** (good['rank'] - 1)
    scores = good.groupby('LKRecID')['rbp_disc'].sum()

    if normalize:
        tns = truth.reset_index().groupby('LKTruthID')['item'].count()
        if k is not None:
            tns[tns > k] = k
        max_nrel = np.max(tns)
        # compute the exponents 0...max_nrel-1 (the discount powers for ranks 1...max_nrel)
        kseq = np.arange(max_nrel)
        # compute the discount at each rank
        nd = patience ** kseq
        # convert to a series of cumulative sums: the max RBP sum for each list length
        max_rbps = pd.Series(np.cumsum(nd), index=kseq + 1)

        # get a rec/truth mapping
        rt_map = recs[['LKRecID', 'LKTruthID']].drop_duplicates()
        rt_map.set_index('LKRecID', inplace=True)
        rt_map = rt_map.reindex(scores.index)
        # map to nrel, and then to the max RBPs
        rt_map = rt_map.join(tns.to_frame('nrel'), on='LKTruthID', how='left')
        rt_map = rt_map.join(max_rbps.to_frame('rbp_max'), on='nrel', how='left')

        # divide each score by its maximum achievable RBP
        scores /= rt_map['rbp_max']
    else:
        scores *= (1 - patience)

    scores = scores.reindex(recs['LKRecID'].unique(), fill_value=0)
    return scores
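
As a quick reference, here is a minimal sketch of how the new metric behaves on a tiny list (it mirrors test_rbp_missing in the test file below; the expected values follow directly from the formula in the docstring):

import pandas as pd

from lenskit.metrics.topn import rbp

# two of the three relevant test items are recommended, at ranks 1 and 2
recs = pd.DataFrame({'item': [1, 2]})
truth = pd.DataFrame({'item': [1, 2, 3], 'rating': [3.0, 5.0, 4.0]}).set_index('item')

# unnormalized: (1 - 0.5) * (0.5**0 + 0.5**1) = 0.75
print(rbp(recs, truth, patience=0.5))

# normalized: the best achievable discount sum over min(nrel, k) = 2 ranks is
# 0.5**0 + 0.5**1 = 1.5, so the score is 1.5 / 1.5 = 1.0
print(rbp(recs, truth, patience=0.5, normalize=True))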
142 changes: 142 additions & 0 deletions tests/test_topn_rbp.py
@@ -0,0 +1,142 @@
import logging
import numpy as np
import pandas as pd

from pytest import approx, mark
from hypothesis import given
import hypothesis.strategies as st
import hypothesis.extra.numpy as nph

from lenskit.metrics.topn import rbp, _bulk_rbp
from lenskit.topn import RecListAnalysis
from lenskit.util.test import demo_recs

_log = logging.getLogger(__name__)


def test_rbp_empty():
    recs = pd.DataFrame({'item': []})
    truth = pd.DataFrame({'item': [1, 2, 3], 'rating': [3.0, 5.0, 4.0]})
    truth = truth.set_index('item')
    assert rbp(recs, truth) == approx(0.0)


def test_rbp_no_match():
    recs = pd.DataFrame({'item': [4]})
    truth = pd.DataFrame({'item': [1, 2, 3], 'rating': [3.0, 5.0, 4.0]})
    truth = truth.set_index('item')
    assert rbp(recs, truth) == approx(0.0)


def test_rbp_one_match():
    recs = pd.DataFrame({'item': [1]})
    truth = pd.DataFrame({'item': [1, 2, 3], 'rating': [3.0, 5.0, 4.0]})
    truth = truth.set_index('item')
    assert rbp(recs, truth) == approx(0.5)


@given(st.lists(st.integers(1), min_size=1, max_size=100, unique=True), st.floats(0.05, 0.95))
def test_rbp_perfect(items, p):
    n = len(items)
    recs = pd.DataFrame({'item': items})
    truth = pd.DataFrame({'item': items, 'rating': 1})
    truth = truth.set_index('item').sort_index()
    assert rbp(recs, truth, patience=p) == approx(np.sum(p ** np.arange(n)) * (1 - p))


@given(st.lists(st.integers(1), min_size=1, max_size=100, unique=True), st.floats(0.05, 0.95))
def test_rbp_perfect_norm(items, p):
    recs = pd.DataFrame({'item': items})
    truth = pd.DataFrame({'item': items, 'rating': 1})
    truth = truth.set_index('item').sort_index()
    assert rbp(recs, truth, patience=p, normalize=True) == approx(1.0)


@given(st.lists(st.integers(1), min_size=1, max_size=100, unique=True),
       st.integers(1, 100), st.floats(0.05, 0.95))
def test_rbp_perfect_k(items, k, p):
    n = len(items)
    eff_n = min(n, k)
    recs = pd.DataFrame({'item': items})
    truth = pd.DataFrame({'item': items, 'rating': 1})
    truth = truth.set_index('item').sort_index()
    assert rbp(recs, truth, k=k, patience=p) == approx(np.sum(p ** np.arange(eff_n)) * (1 - p))


@given(st.lists(st.integers(1), min_size=1, max_size=100, unique=True),
       st.integers(1, 100), st.floats(0.05, 0.95))
def test_rbp_perfect_k_norm(items, k, p):
    recs = pd.DataFrame({'item': items})
    truth = pd.DataFrame({'item': items, 'rating': 1})
    truth = truth.set_index('item').sort_index()
    assert rbp(recs, truth, k=k, patience=p, normalize=True) == approx(1.0)


def test_rbp_missing():
    recs = pd.DataFrame({'item': [1, 2]})
    truth = pd.DataFrame({'item': [1, 2, 3], 'rating': [3.0, 5.0, 4.0]})
    truth = truth.set_index('item').sort_index()
    # (1 + 0.5) * 0.5
    assert rbp(recs, truth) == approx(0.75)


def test_rbp_bulk_at_top():
    truth = pd.DataFrame.from_records([
        (1, 50, 3.5),
        (1, 30, 3.5)
    ], columns=['LKTruthID', 'item', 'rating']).set_index(['LKTruthID', 'item'])

    recs = pd.DataFrame.from_records([
        (1, 1, 50, 1),
        (1, 1, 30, 2),
        (1, 1, 72, 3)
    ], columns=['LKRecID', 'LKTruthID', 'item', 'rank'])

    scores = _bulk_rbp(recs, truth)
    assert len(scores) == 1
    assert scores.index.tolist() == [1]
    assert scores.iloc[0] == approx(0.75)


def test_rbp_bulk_not_at_top():
    truth = pd.DataFrame.from_records([
        (1, 50, 3.5),
        (1, 30, 3.5)
    ], columns=['LKTruthID', 'item', 'rating']).set_index(['LKTruthID', 'item'])

    recs = pd.DataFrame.from_records([
        (1, 1, 50, 1),
        (1, 1, 72, 2),
        (1, 1, 30, 3)
    ], columns=['LKRecID', 'LKTruthID', 'item', 'rank'])

    scores = _bulk_rbp(recs, truth)
    assert len(scores) == 1
    assert scores.index.tolist() == [1]
    assert scores.iloc[0] == approx((1 + 0.25) * 0.5)


@mark.parametrize('normalize', [False, True])
def test_rbp_bulk_match(demo_recs, normalize):
    "bulk and normal match"
    train, test, recs = demo_recs

    rla = RecListAnalysis()
    rla.add_metric(rbp, normalize=normalize)
    rla.add_metric(rbp, name='rbp_k', k=5, normalize=normalize)
    # metrics without the bulk capability, for comparison
    rla.add_metric(lambda *a: rbp(*a, normalize=normalize), name='ind_rbp')
    rla.add_metric(lambda *a, **k: rbp(*a, normalize=normalize, **k), name='ind_rbp_k', k=5)
    res = rla.compute(recs, test)

    res['diff'] = np.abs(res.rbp - res.ind_rbp)
    rl = res.nlargest(5, 'diff')
    _log.info('res:\n%s', rl)
    user = rl.index[0]
    _log.info('user: %s\n%s', user, rl.iloc[0])
    _log.info('test:\n%s', test[test['user'] == user])
    urecs = recs[recs['user'] == user].join(test.set_index(['user', 'item'])['rating'], on=['user', 'item'], how='left')
    _log.info('recs:\n%s', urecs[urecs['rating'].notnull()])

    assert res.rbp.values == approx(res.ind_rbp.values)
    assert res.rbp_k.values == approx(res.ind_rbp_k.values)
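And a hedged sketch of driving the metric through RecListAnalysis, the way test_rbp_bulk_match does; the small hand-built user/item frames here are illustrative assumptions rather than repository fixtures:

import pandas as pd

from lenskit.metrics.topn import rbp
from lenskit.topn import RecListAnalysis

# one recommendation list per user (columns: user, item, rank)
recs = pd.DataFrame({
    'user': [1, 1, 1, 2, 2],
    'item': [10, 11, 12, 10, 13],
    'rank': [1, 2, 3, 1, 2],
})
# held-out relevant items per user
test = pd.DataFrame({
    'user': [1, 1, 2],
    'item': [10, 12, 13],
    'rating': [4.0, 3.0, 5.0],
})

rla = RecListAnalysis()
rla.add_metric(rbp)
rla.add_metric(rbp, name='rbp_norm', normalize=True)
scores = rla.compute(recs, test)
# user 1 hits at ranks 1 and 3: rbp = 0.5 * (1 + 0.25) = 0.625,
# rbp_norm = (1 + 0.25) / (1 + 0.5) ~= 0.833
print(scores)

Because _bulk_rbp is registered via @bulk_impl, RecListAnalysis can score all lists in one vectorized pass rather than looping over users.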