feat: Add TimeBoundedPopScore for time-bounded popularity #493

Open · wants to merge 1 commit into base: main
65 changes: 59 additions & 6 deletions lenskit/lenskit/algorithms/basic.py
@@ -1,4 +1,4 @@
# This file is part of LensKit.

[CI: GitHub Actions / Check pre-commit hooks failed at line 1: pre-commit would modify this file]
# Copyright (C) 2018-2023 Boise State University
# Copyright (C) 2023-2024 Drexel University
# Licensed under the MIT license, see LICENSE.md for details.
@@ -8,23 +8,24 @@
Basic utility algorithms and combiners.
"""

from datetime import datetime, timedelta
import logging
from collections.abc import Iterable, Sequence
from typing import overload

import numpy as np
import pandas as pd
from typing_extensions import override

from lenskit.algorithms import CandidateSelector, Predictor, Recommender
from lenskit.algorithms.bias import Bias # noqa: F401
from lenskit.algorithms.ranking import TopN # noqa: F401
from lenskit.data import Dataset
from lenskit.data.matrix import CSRStructure
from lenskit.data.vocab import Vocabulary
from lenskit.util import derivable_rng

_logger = logging.getLogger(__name__)

[CI: GitHub Actions / Annotate with lint failures, Ruff I001 at lenskit/lenskit/algorithms/basic.py:11:1: Import block is un-sorted or un-formatted]


class PopScore(Predictor):
@@ -41,7 +42,7 @@
- ``'count'``

Attributes:
item_pop_(pandas.Series):
item_scores_(pandas.Series):
Item popularity scores.
"""

@@ -51,8 +52,13 @@
@override
def fit(self, data: Dataset, **kwargs):
_logger.info("counting item popularity")
stats = data.item_stats()
scores = stats["count"]

counts = data.item_stats()["count"]
self.item_scores_ = self._fit_internal(counts, **kwargs)

return self

def _fit_internal(self, scores: pd.Series, **kwargs):
if self.score_method == "rank":
_logger.info("ranking %d items", len(scores))
scores = scores.rank().sort_index()
@@ -68,9 +74,7 @@
else:
raise ValueError("invalid scoring method " + repr(self.score_method))

self.item_scores_ = scores

return self
return scores

@override
def predict_for_user(self, user, items, ratings=None):
@@ -80,6 +84,55 @@
return "PopScore({})".format(self.score_method)


class TimeBoundedPopScore(PopScore):
"""
Score items by their time-bounded popularity, i.e., the popularity in the
most recent `time_window` period. Use with :py:class:`TopN` to get a
most-popular-recent-items recommender.

Args:
time_window(datetime.timedelta):
The time window for computing popularity scores.
score_method(str):
The method for computing popularity scores. Can be one of the following:

- ``'quantile'`` (the default)
- ``'rank'``
- ``'count'``

Attributes:
item_scores_(pandas.Series):
Time-bounded item popularity scores.
"""

def __init__(self, time_window: timedelta, score_method="quantile"):
super().__init__(score_method)

self.time_window = time_window
self.score_method = score_method

@override
def fit(self, data: Dataset, **kwargs):
_logger.info("counting time-bounded item popularity")

log = data.interaction_log("numpy")
Contributor (Author) commented:
I was thinking about adding a function to time-bound a Dataset, but realized that might be too much since we are just going to use the counts. I assume that would still be useful for some things (a validation set?), but doing the calculation here seems cheaper. LMK if this seems right!
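
For reference, a rough sketch of the heavier alternative mentioned in the comment: pre-filter the interactions to the window and build a fresh dataset, so the unmodified PopScore.fit could be reused. This is illustrative only; the `time_bounded_dataset` helper is hypothetical, and it assumes the interactions are available as a pandas DataFrame with a `timestamp` column, using the same `from_interactions_df` helper the tests below import.

from datetime import datetime, timedelta

import pandas as pd

from lenskit.data.convert import from_interactions_df


def time_bounded_dataset(interactions: pd.DataFrame, window: timedelta):
    # Hypothetical helper: keep only interactions newer than the window cutoff.
    cutoff = (datetime.now() - window).timestamp()
    recent = interactions[interactions["timestamp"] > cutoff]
    return from_interactions_df(recent)
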


counts = np.zeros(data.item_count, dtype=np.int32)
start_timestamp = (datetime.now() - self.time_window).timestamp()
# Count only interactions inside the window; keep everything if the log has no timestamps.
item_nums = log.item_nums
if log.timestamps is not None:
    item_nums = item_nums[log.timestamps > start_timestamp]
np.add.at(counts, item_nums, 1)

self.item_scores_ = super()._fit_internal(
pd.Series(counts, index=data.items.index), **kwargs
)

return self

@override
def __str__(self):
return "TimeBoundedPopScore({}, {})".format(self.time_window, self.score_method)


class Memorized(Predictor):
"""
The memorized algorithm memorizes scores provided at construction time
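
For context on how the new class is meant to be used, here is a minimal usage sketch wrapping it in TopN, as the docstring suggests. The exact TopN constructor and recommend() signature shown here are assumptions about the surrounding API, not part of this diff, and `dataset` stands in for a lenskit.data.Dataset built elsewhere.

from datetime import timedelta

from lenskit.algorithms import basic
from lenskit.algorithms.ranking import TopN

# Score items by popularity within the last 30 days, then recommend the top N.
scorer = basic.TimeBoundedPopScore(timedelta(days=30))
rec = TopN(scorer)
rec.fit(dataset)  # `dataset` is a lenskit.data.Dataset prepared elsewhere
recs = rec.recommend(42, n=10)  # top-10 recently-popular items for user 42
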
61 changes: 61 additions & 0 deletions lenskit/tests/algorithms/test_time_bounded_popular.py
@@ -0,0 +1,61 @@
# This file is part of LensKit.

[CI: GitHub Actions / Check pre-commit hooks failed at line 1: pre-commit would modify this file]
# Copyright (C) 2018-2023 Boise State University
# Copyright (C) 2023-2024 Drexel University
# Licensed under the MIT license, see LICENSE.md for details.
# SPDX-License-Identifier: MIT

from datetime import datetime, timedelta
import pickle

import numpy as np
import pandas as pd

from lenskit.data.convert import from_interactions_df
from lenskit.algorithms import basic

day = timedelta(days=1)

[CI: GitHub Actions / Annotate with lint failures, Ruff I001 at lenskit/tests/algorithms/test_time_bounded_popular.py:7:1: Import block is un-sorted or un-formatted]
now = int(datetime.now().timestamp())
one_day_ago = now - day.total_seconds()
simple_df = pd.DataFrame(
{
"item": [1, 2, 2, 3],
"user": [10, 12, 10, 13],
"rating": [4.0, 3.0, 5.0, 2.0],
"timestamp": [now, one_day_ago, one_day_ago, one_day_ago],
}
)
simple_ds = from_interactions_df(simple_df)


def test_time_bounded_pop_score_quantile_one_day_window():
algo = basic.TimeBoundedPopScore(day)
algo.fit(simple_ds)
assert algo.item_scores_.equals(pd.Series([1.0, 0.0, 0.0], index=[1, 2, 3]))


def test_time_bounded_pop_score_quantile_two_day_window():
algo = basic.TimeBoundedPopScore(2 * day)
algo.fit(simple_ds)
assert algo.item_scores_.equals(pd.Series([0.25, 1.0, 0.5], index=[1, 2, 3]))


def test_time_bounded_pop_score_rank():
algo = basic.TimeBoundedPopScore(2 * day, "rank")
algo.fit(simple_ds)
assert algo.item_scores_.equals(pd.Series([1.5, 3.0, 1.5], index=[1, 2, 3]))
@ZiyaoWei (Contributor, Author) commented on Oct 18, 2024:
These tests seem very low level, but I think that's OK since the internals are important and likely to be stable over time?
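
As a sanity check on the expected values above, here is a small worked computation for the two-day-window quantile case. It assumes the 'quantile' method scores each item by its cumulative share of counts (sorted ascending), which is what the expected numbers imply; the tie between items 1 and 3 is resolved in index order here.

import pandas as pd

# Counts within the 2-day window for simple_df: item 1 -> 1, item 2 -> 2, item 3 -> 1.
counts = pd.Series([1, 2, 1], index=[1, 2, 3])

# Cumulative count mass over items sorted by count, divided by the total count.
quantiles = counts.sort_values(kind="stable").cumsum() / counts.sum()
print(quantiles.sort_index())  # item 1 -> 0.25, item 2 -> 1.0, item 3 -> 0.5
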



def test_time_bounded_pop_score_counts(rng):
algo = basic.TimeBoundedPopScore(2 * day, "count")
algo.fit(simple_ds)
assert algo.item_scores_.equals(pd.Series([1, 2, 1], index=[1, 2, 3], dtype=np.int32))


def test_time_bounded_pop_score_save_load():
original = basic.TimeBoundedPopScore(day)
original.fit(simple_ds)

mod = pickle.dumps(original)
algo = pickle.loads(mod)

assert all(algo.item_scores_ == original.item_scores_)