Skip to content

Commit

Permalink
add diff for join columns (#339)
Browse files Browse the repository at this point in the history
  • Loading branch information
rhaffar authored Oct 17, 2024
1 parent 67d0fa2 commit 1078a82
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 4 deletions.
5 changes: 4 additions & 1 deletion datacompy/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,10 @@ def _validate_dataframe(
dataframe.columns = pd.Index([str(col) for col in dataframe.columns])
# Check if join_columns are present in the dataframe
if not set(self.join_columns).issubset(set(dataframe.columns)):
- raise ValueError(f"{index} must have all columns from join_columns")
+ missing_cols = set(self.join_columns) - set(dataframe.columns)
+ raise ValueError(
+ f"{index} must have all columns from join_columns: {missing_cols}"
+ )

if len(set(dataframe.columns)) < len(dataframe.columns):
raise ValueError(f"{index} must have unique column names")
Expand Down
5 changes: 4 additions & 1 deletion datacompy/polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,10 @@ def _validate_dataframe(

# Check if join_columns are present in the dataframe
if not set(self.join_columns).issubset(set(dataframe.columns)):
- raise ValueError(f"{index} must have all columns from join_columns")
+ missing_cols = set(self.join_columns) - set(dataframe.columns)
+ raise ValueError(
+ f"{index} must have all columns from join_columns: {missing_cols}"
+ )

if len(set(dataframe.columns)) < len(dataframe.columns):
raise ValueError(f"{index} must have unique column names")
Expand Down
5 changes: 4 additions & 1 deletion datacompy/spark/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,10 @@ def _validate_dataframe(
dataframe.columns = [str(col) for col in dataframe.columns]
# Check if join_columns are present in the dataframe
if not set(self.join_columns).issubset(set(dataframe.columns)):
- raise ValueError(f"{index} must have all columns from join_columns")
+ missing_cols = set(self.join_columns) - set(dataframe.columns)
+ raise ValueError(
+ f"{index} must have all columns from join_columns: {missing_cols}"
+ )

if len(set(dataframe.columns)) < len(dataframe.columns):
raise ValueError(f"{index} must have unique column names")
Expand Down
5 changes: 4 additions & 1 deletion datacompy/spark/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,10 @@ def _validate_dataframe(
# Check if join_columns are present in the dataframe
dataframe = getattr(self, index) # refresh
if not set(self.join_columns).issubset(set(dataframe.columns)):
- raise ValueError(f"{index} must have all columns from join_columns")
+ missing_cols = set(self.join_columns) - set(dataframe.columns)
+ raise ValueError(
+ f"{index} must have all columns from join_columns: {missing_cols}"
+ )

if len(set(dataframe.columns)) < len(dataframe.columns):
raise ValueError(f"{index} must have unique column names")
Expand Down

0 comments on commit 1078a82

Please sign in to comment.