Skip to content

Commit

Permalink
Add totality validation pandas-dev#58547
Browse files Browse the repository at this point in the history
  • Loading branch information
Falk B. Schimweg committed May 6, 2024
1 parent 1c0e031 commit 93ebe3d
Show file tree
Hide file tree
Showing 4 changed files with 190 additions and 45 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Other enhancements
- :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
- :meth:`DataFrame.merge` now supports validation of (left-/right-)totality (:issue:`58547`)
- :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`)
- Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
- Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,14 @@
* "many_to_one" or "m:1": check if merge keys are unique in right
dataset.
* "many_to_many" or "m:m": allowed, but does not result in checks.
* "total": check if all merge keys on each side are also present
on the other side.
* "left_total": check if merge keys on the left side are all present
on the right side.
* "right_total": check if merge keys on the right side are all present
on the left side.
More than one merge type can be passed when separated by a ``+``.
Returns
-------
Expand Down
137 changes: 93 additions & 44 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1623,62 +1623,111 @@ def _validate_left_right_on(self, left_on, right_on):

@final
def _validate_validate_kwd(self, validate: str) -> None:
    """
    Check the merge keys against the constraints named in ``validate``.

    Parameters
    ----------
    validate : str
        One or more constraint names joined by ``+`` (whitespace around the
        ``+`` is ignored), e.g. ``"1:1"`` or ``"one_to_one+left_total"``.
        Accepted names are ``"1:1"``/``"one_to_one"``,
        ``"1:m"``/``"one_to_many"``, ``"m:1"``/``"many_to_one"``,
        ``"m:m"``/``"many_to_many"``, ``"total"``, ``"left_total"`` and
        ``"right_total"``.

    Raises
    ------
    ValueError
        If any of the names is not recognised. All names are checked before
        the data is inspected, so an invalid argument is reported
        independently of the contents of the frames.
    MergeError
        If the merge keys violate one of the requested constraints.
    """
    canonical = {
        "one_to_one": "1:1",
        "1:1": "1:1",
        "one_to_many": "1:m",
        "1:m": "1:m",
        "many_to_one": "m:1",
        "m:1": "m:1",
        "many_to_many": "m:m",
        "m:m": "m:m",
        "total": "total",
        "left_total": "left_total",
        "right_total": "right_total",
    }

    # Resolve every requested validation up front.
    checks = []
    for token in validate.split("+"):
        token = token.strip()
        if token not in canonical:
            raise ValueError(
                f'"{token}" is not a valid argument. '
                "Valid arguments are:\n"
                '- "1:1"\n'
                '- "1:m"\n'
                '- "m:1"\n'
                '- "m:m"\n'
                '- "one_to_one"\n'
                '- "one_to_many"\n'
                '- "many_to_one"\n'
                '- "many_to_many"\n'
                '- "total"\n'
                '- "left_total"\n'
                '- "right_total"'
            )
        checks.append(canonical[token])

    requested = set(checks)
    needs_unique = bool(requested & {"1:1", "1:m", "m:1"})
    needs_left_total = bool(requested & {"total", "left_total"})
    needs_right_total = bool(requested & {"total", "right_total"})

    if not (needs_unique or needs_left_total or needs_right_total):
        # Only "m:m" was requested: nothing to verify.
        return

    if self.left_index:
        left_merge_index = self.orig_left.index
    else:
        left_merge_index = MultiIndex.from_arrays(self.left_join_keys)

    if self.right_index:
        right_merge_index = self.orig_right.index
    else:
        right_merge_index = MultiIndex.from_arrays(self.right_join_keys)

    # Defaults are never consulted unless the matching check was requested.
    left_unique = right_unique = True
    if needs_unique:
        left_unique = left_merge_index.is_unique
        right_unique = right_merge_index.is_unique

    left_total = right_total = True
    if needs_left_total or needs_right_total:
        # A single-level MultiIndex holds 1-tuples, which never compare equal
        # to the scalars of a flat Index. Flatten it so that an index on one
        # side can be compared with a single key column on the other.
        left_keys = left_merge_index
        if isinstance(left_keys, MultiIndex) and left_keys.nlevels == 1:
            left_keys = left_keys.get_level_values(0)
        right_keys = right_merge_index
        if isinstance(right_keys, MultiIndex) and right_keys.nlevels == 1:
            right_keys = right_keys.get_level_values(0)

        # ``isin`` is a vectorised membership test; it needs no sorting, so
        # keys of mixed, unorderable types are supported.
        if needs_left_total:
            left_total = bool(left_keys.isin(right_keys).all())
        if needs_right_total:
            right_total = bool(right_keys.isin(left_keys).all())

    # Check data integrity, in the order the validations were given.
    for check in checks:
        if check == "1:1":
            if not left_unique and not right_unique:
                raise MergeError(
                    "Merge keys are not unique in either left "
                    "or right dataset; not a one-to-one merge"
                )
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; "
                    "not a one-to-one merge"
                )
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; "
                    "not a one-to-one merge"
                )

        elif check == "1:m":
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; "
                    "not a one-to-many merge"
                )

        elif check == "m:1":
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; "
                    "not a many-to-one merge"
                )

        elif check == "total":
            if not left_total and not right_total:
                raise MergeError(
                    "Neither the merge keys in the left dataset are all present "
                    "in the right dataset, nor the merge keys in the right "
                    "dataset all present in the left dataset; not a total merge."
                )
            if not left_total:
                raise MergeError(
                    "Merge keys in left dataset are not all present in the right "
                    "dataset; not a total merge"
                )
            if not right_total:
                raise MergeError(
                    "Merge keys in right dataset are not all present "
                    "in the left dataset; not a total merge"
                )

        elif check == "left_total":
            if not left_total:
                raise MergeError(
                    "Merge keys in left dataset are not all present "
                    "in the right dataset; not a left total merge"
                )

        elif check == "right_total":
            if not right_total:
                raise MergeError(
                    "Merge keys in right dataset are not all present "
                    "in the left dataset; not a right total merge"
                )
        # "m:m" is allowed but does not result in any check.


def get_join_indexers(
Expand Down
89 changes: 88 additions & 1 deletion pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1203,6 +1203,44 @@ def test_validation(self):
)
tm.assert_frame_equal(result, expected_3)

# Make sure left totality works
result = merge(
left,
right,
left_index=True,
right_index=True,
validate="one_to_one+left_total",
)
tm.assert_frame_equal(result, expected)

# Make sure right totality raises exception
msg = (
"Merge keys in right dataset are not all present in the left dataset; "
"not a right total merge"
)
with pytest.raises(MergeError, match=msg):
merge(
left,
right,
left_index=True,
right_index=True,
validate="one_to_one+right_total",
)

# Make sure general totality raises exception
msg = (
"Merge keys in right dataset are not all present in the left dataset; "
"not a total merge"
)
with pytest.raises(MergeError, match=msg):
merge(
left,
right,
left_index=True,
right_index=True,
validate="one_to_one+total",
)

# Dups on right
right_w_dups = concat([right, DataFrame({"a": ["e"], "c": ["moo"]}, index=[4])])
merge(
Expand All @@ -1213,6 +1251,14 @@ def test_validation(self):
validate="one_to_many",
)

merge(
left,
right_w_dups,
left_index=True,
right_index=True,
validate="left_total",
)

msg = "Merge keys are not unique in right dataset; not a one-to-one merge"
with pytest.raises(MergeError, match=msg):
merge(
Expand All @@ -1237,6 +1283,13 @@ def test_validation(self):
right_index=True,
validate="many_to_one",
)
merge(
left_w_dups,
right,
left_index=True,
right_index=True,
validate="left_total",
)

msg = "Merge keys are not unique in left dataset; not a one-to-one merge"
with pytest.raises(MergeError, match=msg):
Expand Down Expand Up @@ -1279,7 +1332,10 @@ def test_validation(self):
'- "one_to_one"\n'
'- "one_to_many"\n'
'- "many_to_one"\n'
'- "many_to_many"'
'- "many_to_many"\n'
'- "total"\n'
'- "left_total"\n'
'- "right_total"'
)
with pytest.raises(ValueError, match=msg):
merge(left, right, on="a", validate="jibberish")
Expand Down Expand Up @@ -1323,6 +1379,37 @@ def test_validation(self):
result = merge(left, right, on=["a", "b"], validate="1:1")
tm.assert_frame_equal(result, expected_multi)

right_total_ext = concat(
[right, DataFrame({"a": ["b"], "b": [1], "d": ["neigh"]}, index=[3])],
sort=True,
)
expected_total_ext = DataFrame(
{
"a": ["a", "a", "b", "b"],
"b": [0, 1, 0, 1],
"c": ["cat", "dog", "weasel", "horse"],
"d": ["meow", "bark", "um... weasel noise?", "neigh"],
},
index=range(4),
)
result = merge(left, right_total_ext, on=["a", "b"], validate="1:1+total")
tm.assert_frame_equal(result, expected_total_ext)

# Ensure not left total raises error
right_reduced = right.drop_duplicates(subset=["b"])

msg = (
"Merge keys in left dataset are not all present in the right dataset; "
"not a left total merge"
)
with pytest.raises(MergeError, match=msg):
merge(
left,
right_reduced,
on=["a", "b"],
validate="left_total",
)

def test_merge_two_empty_df_no_division_error(self):
# GH17776, PR #17846
a = DataFrame({"a": [], "b": [], "c": []})
Expand Down

0 comments on commit 93ebe3d

Please sign in to comment.