Skip to content

Commit

Permalink
adding Polars v1 tweaks for testing (#325)
Browse files (browse the repository at this point in the history)
  • Loading branch information
fdosani authored Jul 18, 2024
1 parent 6918249 commit a42efa0
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 22 deletions.
2 changes: 1 addition & 1 deletion datacompy/polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ def _dataframe_merge(self, ignore_spaces: bool) -> None:
df1 = df1.with_columns(_merge_left=pl.lit(True))
df2 = df2.with_columns(_merge_right=pl.lit(True))

outer_join = df1.join(df2, how="outer_coalesce", join_nulls=True, **params)
outer_join = df1.join(df2, how="full", coalesce=True, join_nulls=True, **params)

# process merge indicator
outer_join = outer_join.with_columns(
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ maintainers = [
{ name="Faisal Dosani", email="[email protected]" }
]
license = {text = "Apache Software License"}
dependencies = ["pandas<=2.2.2,>=0.25.0", "numpy<=1.26.4,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "fugue<=0.9.1,>=0.8.7", "polars<=0.20.31,>=0.20.4"]
dependencies = ["pandas<=2.2.2,>=0.25.0", "numpy<=1.26.4,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "fugue<=0.9.1,>=0.8.7", "polars<=1.1.0,>=0.20.4"]
requires-python = ">=3.9.0"
classifiers = [
"Intended Audience :: Developers",
Expand Down
54 changes: 34 additions & 20 deletions tests/test_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,18 +389,14 @@ def test_compare_df_setter_bad():
PolarsCompare(df, df.clone(), ["b"])
with raises(DuplicateError, match="duplicate column names found"):
PolarsCompare(df_same_col_names, df_same_col_names.clone(), ["a"])
assert (
PolarsCompare(df_dupe, df_dupe.clone(), ["a", "b"])
.df1.drop("_merge_left")
.equals(df_dupe)
)
assert PolarsCompare(df_dupe, df_dupe.clone(), ["a", "b"]).df1.equals(df_dupe)


def test_compare_df_setter_good():
df1 = pl.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}])
df2 = pl.DataFrame([{"A": 1, "B": 2}, {"A": 2, "B": 3}])
compare = PolarsCompare(df1, df2, ["a"])
assert compare.df1.drop("_merge_left").equals(df1)
assert compare.df1.equals(df1)
assert compare.df2.equals(df2)
assert compare.join_columns == ["a"]
compare = PolarsCompare(df1, df2, ["A", "b"])
Expand Down Expand Up @@ -1177,10 +1173,12 @@ def test_all_mismatch_ignore_matching_cols_no_cols_matching():
"strings": ["1", "1", "1", "1.1", "1"],
"mixed_strings": ["1", "1", "1", "2", "some string"],
"infinity": [1, 1, 1, 1, np.inf],
}
},
strict=False,
)


@pytest.mark.skipif(pl.__version__ < "1.0.0", reason="polars breaking changes")
@pytest.mark.parametrize(
"column,expected",
[
Expand All @@ -1204,10 +1202,12 @@ def test_dupes_with_nulls():
{
"fld_1": [1, 2, 2, 3, 3, 4, 5, 5],
"fld_2": ["A", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
}
},
strict=False,
)
df2 = pl.DataFrame(
{"fld_1": [1, 2, 3, 4, 5], "fld_2": ["A", np.nan, np.nan, np.nan, np.nan]}
{"fld_1": [1, 2, 3, 4, 5], "fld_2": ["A", np.nan, np.nan, np.nan, np.nan]},
strict=False,
)
comp = PolarsCompare(df1, df2, join_columns=["fld_1", "fld_2"])
assert comp.subset()
Expand All @@ -1216,37 +1216,51 @@ def test_dupes_with_nulls():
@pytest.mark.parametrize(
"dataframe,expected",
[
(pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}), pl.Series([1, 1, 1])),
(
pl.DataFrame({"a": ["a", "a", "DATACOMPY_NULL"], "b": [1, 1, 2]}),
pl.Series([1, 2, 1]),
pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}),
pl.Series([1, 1, 1], strict=False),
),
(
pl.DataFrame(
{"a": ["a", "a", "DATACOMPY_NULL"], "b": [1, 1, 2]}, strict=False
),
pl.Series([1, 2, 1], strict=False),
),
(pl.DataFrame({"a": [-999, 2, 3], "b": [1, 2, 3]}), pl.Series([1, 1, 1])),
(
pl.DataFrame({"a": [1, np.nan, np.nan], "b": [1, 2, 2]}),
pl.Series([1, 1, 2]),
pl.DataFrame({"a": [-999, 2, 3], "b": [1, 2, 3]}, strict=False),
pl.Series([1, 1, 1], strict=False),
),
(
pl.DataFrame({"a": ["1", np.nan, np.nan], "b": ["1", "2", "2"]}),
pl.Series([1, 1, 2]),
pl.DataFrame({"a": [1, np.nan, np.nan], "b": [1, 2, 2]}, strict=False),
pl.Series([1, 1, 2], strict=False),
),
(
pl.DataFrame(
{"a": [datetime(2018, 1, 1), None, None], "b": ["1", "2", "2"]}
{"a": ["1", np.nan, np.nan], "b": ["1", "2", "2"]}, strict=False
),
pl.Series([1, 1, 2]),
pl.Series([1, 1, 2], strict=False),
),
(
pl.DataFrame(
{"a": [datetime(2018, 1, 1), None, None], "b": ["1", "2", "2"]},
strict=False,
),
pl.Series([1, 1, 2], strict=False),
),
],
)
def test_generate_id_within_group(dataframe, expected):
    """Check the per-group ids generated for one parametrized case.

    Calls ``generate_id_within_group`` on ``dataframe`` with the grouping
    columns ``["a", "b"]`` and asserts that the element-wise comparison
    against ``expected`` holds for every element.
    """
    group_ids = generate_id_within_group(dataframe, ["a", "b"])
    matches = group_ids == expected
    assert matches.all()


@pytest.mark.skipif(pl.__version__ < "1.0.0", reason="polars breaking changes")
@pytest.mark.parametrize(
"dataframe, message",
[
(
pl.DataFrame({"a": [1, np.nan, "DATACOMPY_NULL"], "b": [1, 2, 3]}),
pl.DataFrame(
{"a": [1, None, "DATACOMPY_NULL"], "b": [1, 2, 3]}, strict=False
),
"DATACOMPY_NULL was found in your join columns",
)
],
Expand Down

0 comments on commit a42efa0

Please sign in to comment.