From ca24824f039b62dfbfac630a35b2ab52279cbd08 Mon Sep 17 00:00:00 2001
From: Faisal Dosani <faisal.dosani@pm.me>
Date: Sun, 29 Sep 2024 15:29:37 -0300
Subject: [PATCH 1/2] check for is_string_dtype and unsupported mixed type

---
 datacompy/__init__.py |  2 +-
 datacompy/core.py     | 22 ++++++++++++++++++----
 tests/test_core.py    | 38 +++++++++++++++++++-------------------
 3 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/datacompy/__init__.py b/datacompy/__init__.py
index 8dfa816a..a1890224 100644
--- a/datacompy/__init__.py
+++ b/datacompy/__init__.py
@@ -18,7 +18,7 @@
 Then extended to carry that functionality over to Spark Dataframes.
 """
 
-__version__ = "0.13.3"
+__version__ = "0.14.0"
 
 import platform
 from warnings import warn
diff --git a/datacompy/core.py b/datacompy/core.py
index 0089dc38..0f2722b8 100644
--- a/datacompy/core.py
+++ b/datacompy/core.py
@@ -770,6 +770,11 @@ def columns_equal(
     - Non-numeric values (i.e. where np.isclose can't be used) will just
       trigger True on two nulls or exact matches.
 
+    Notes
+    -----
+    As of version ``0.14.0``, if either column is of a mixed data type, every row
+    of the comparison is returned as ``False``.
+
     Parameters
     ----------
     col_1 : Pandas.Series
@@ -792,6 +797,15 @@ def columns_equal(
         values don't match.
     """
     compare: pd.Series[bool]
+
+    # short circuit if comparing mixed type columns. We don't want to support this moving forward.
+    if pd.api.types.infer_dtype(col_1).startswith("mixed") or pd.api.types.infer_dtype(
+        col_1
+    ).startswith("mixed"):
+        compare = pd.Series(False, index=col_1.index)
+        compare.index = col_1.index
+        return compare
+
     try:
         compare = pd.Series(
             np.isclose(col_1, col_2, rtol=rel_tol, atol=abs_tol, equal_nan=True)
@@ -810,15 +824,15 @@ def columns_equal(
         except (ValueError, TypeError):
             try:
                 if ignore_spaces:
-                    if col_1.dtype.kind == "O":
+                    if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
                         col_1 = col_1.str.strip()
-                    if col_2.dtype.kind == "O":
+                    if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
                         col_2 = col_2.str.strip()
 
                 if ignore_case:
-                    if col_1.dtype.kind == "O":
+                    if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
                         col_1 = col_1.str.upper()
-                    if col_2.dtype.kind == "O":
+                    if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
                         col_2 = col_2.str.upper()
 
                 if {col_1.dtype.kind, col_2.dtype.kind} == {"M", "O"}:
diff --git a/tests/test_core.py b/tests/test_core.py
index 14298e09..482a12f4 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -98,7 +98,7 @@ def test_string_columns_equal_with_ignore_spaces():
 something||False
 |something|False
 ||True"""
-    df = pd.read_csv(io.StringIO(data), sep="|")
+    df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False)
     actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True)
     expect_out = df["expected"]
     assert_series_equal(expect_out, actual_out, check_names=False)
@@ -119,7 +119,7 @@ def test_string_columns_equal_with_ignore_spaces_and_case():
 something||False
 |something|False
 ||True"""
-    df = pd.read_csv(io.StringIO(data), sep="|")
+    df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False)
     actual_out = datacompy.columns_equal(
         df.a, df.b, rel_tol=0.2, ignore_spaces=True, ignore_case=True
     )
@@ -160,7 +160,7 @@ def test_date_columns_equal_with_ignore_spaces():
 2017-01-01||False
 |2017-01-01|False
 ||True"""
-    df = pd.read_csv(io.StringIO(data), sep="|")
+    df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False)
     # First compare just the strings
     actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True)
     expect_out = df["expected"]
@@ -192,7 +192,7 @@ def test_date_columns_equal_with_ignore_spaces_and_case():
 2017-01-01||False
 |2017-01-01|False
 ||True"""
-    df = pd.read_csv(io.StringIO(data), sep="|")
+    df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False)
     # First compare just the strings
     actual_out = datacompy.columns_equal(
         df.a, df.b, rel_tol=0.2, ignore_spaces=True, ignore_case=True
@@ -364,10 +364,10 @@ def test_infinity_and_beyond():
 def test_mixed_column():
     df = pd.DataFrame(
         [
-            {"a": "hi", "b": "hi", "expected": True},
-            {"a": 1, "b": 1, "expected": True},
-            {"a": np.inf, "b": np.inf, "expected": True},
-            {"a": Decimal("1"), "b": Decimal("1"), "expected": True},
+            {"a": "hi", "b": "hi", "expected": False},
+            {"a": 1, "b": 1, "expected": False},
+            {"a": np.inf, "b": np.inf, "expected": False},
+            {"a": Decimal("1"), "b": Decimal("1"), "expected": False},
             {"a": 1, "b": "1", "expected": False},
             {"a": 1, "b": "yo", "expected": False},
         ]
@@ -380,10 +380,10 @@ def test_mixed_column():
 def test_mixed_column_with_ignore_spaces():
     df = pd.DataFrame(
         [
-            {"a": "hi", "b": "hi ", "expected": True},
-            {"a": 1, "b": 1, "expected": True},
-            {"a": np.inf, "b": np.inf, "expected": True},
-            {"a": Decimal("1"), "b": Decimal("1"), "expected": True},
+            {"a": "hi", "b": "hi ", "expected": False},
+            {"a": 1, "b": 1, "expected": False},
+            {"a": np.inf, "b": np.inf, "expected": False},
+            {"a": Decimal("1"), "b": Decimal("1"), "expected": False},
             {"a": 1, "b": "1 ", "expected": False},
             {"a": 1, "b": "yo ", "expected": False},
         ]
@@ -396,15 +396,15 @@ def test_mixed_column_with_ignore_spaces():
 def test_mixed_column_with_ignore_spaces_and_case():
     df = pd.DataFrame(
         [
-            {"a": "hi", "b": "hi ", "expected": True},
-            {"a": 1, "b": 1, "expected": True},
-            {"a": np.inf, "b": np.inf, "expected": True},
-            {"a": Decimal("1"), "b": Decimal("1"), "expected": True},
+            {"a": "hi", "b": "hi ", "expected": False},
+            {"a": 1, "b": 1, "expected": False},
+            {"a": np.inf, "b": np.inf, "expected": False},
+            {"a": Decimal("1"), "b": Decimal("1"), "expected": False},
             {"a": 1, "b": "1 ", "expected": False},
             {"a": 1, "b": "yo ", "expected": False},
-            {"a": "Hi", "b": "hI ", "expected": True},
-            {"a": "HI", "b": "HI ", "expected": True},
-            {"a": "hi", "b": "hi ", "expected": True},
+            {"a": "Hi", "b": "hI ", "expected": False},
+            {"a": "HI", "b": "HI ", "expected": False},
+            {"a": "hi", "b": "hi ", "expected": False},
         ]
     )
     actual_out = datacompy.columns_equal(

From bb69cb8b1af553140c3972a73bb3d8b3dad76dee Mon Sep 17 00:00:00 2001
From: Faisal Dosani <faisal.dosani@pm.me>
Date: Wed, 9 Oct 2024 13:16:42 -0300
Subject: [PATCH 2/2] fixing typo

---
 datacompy/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datacompy/core.py b/datacompy/core.py
index 0f2722b8..f9a3a314 100644
--- a/datacompy/core.py
+++ b/datacompy/core.py
@@ -800,7 +800,7 @@ def columns_equal(
 
     # short circuit if comparing mixed type columns. We don't want to support this moving forward.
     if pd.api.types.infer_dtype(col_1).startswith("mixed") or pd.api.types.infer_dtype(
-        col_1
+        col_2
     ).startswith("mixed"):
         compare = pd.Series(False, index=col_1.index)
         compare.index = col_1.index