pandas-dev · mroeschke · Jun 3, 2024 · May 3, 2024 · May 9, 2024 · Jun 3, 2024
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -44,8 +44,8 @@ Other enhancements
 - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
+- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
 - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
--
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.notable_bug_fixes:

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -91,7 +91,7 @@
 
 _version_error = (
     "Version of given Stata file is {version}. pandas supports importing "
-    "versions 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), "
+    "versions 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), "
     "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
     "and 119 (Stata 15/16, over 32,767 variables)."
 )
@@ -1393,7 +1393,7 @@ def _get_seek_variable_labels(self) -> int:
 
     def _read_old_header(self, first_char: bytes) -> None:
         self._format_version = int(first_char[0])
-        if self._format_version not in [104, 105, 108, 110, 111, 113, 114, 115]:
+        if self._format_version not in [103, 104, 105, 108, 110, 111, 113, 114, 115]:
             raise ValueError(_version_error.format(version=self._format_version))
         self._set_encoding()
         self._byteorder = ">" if self._read_int8() == 0x1 else "<"
@@ -1405,7 +1405,8 @@ def _read_old_header(self, first_char: bytes) -> None:
 
         self._data_label = self._get_data_label()
 
-        self._time_stamp = self._get_time_stamp()
+        if self._format_version >= 105:
+            self._time_stamp = self._get_time_stamp()
 
         # descriptors
         if self._format_version >= 111:

diff --git a/pandas/tests/io/data/stata/stata-compat-103.dta b/pandas/tests/io/data/stata/stata-compat-103.dta
diff --git a/pandas/tests/io/data/stata/stata-compat-104.dta b/pandas/tests/io/data/stata/stata-compat-104.dta
diff --git a/pandas/tests/io/data/stata/stata-compat-be-103.dta b/pandas/tests/io/data/stata/stata-compat-be-103.dta
diff --git a/pandas/tests/io/data/stata/stata-compat-be-104.dta b/pandas/tests/io/data/stata/stata-compat-be-104.dta
diff --git a/pandas/tests/io/data/stata/stata4_103.dta b/pandas/tests/io/data/stata/stata4_103.dta
diff --git a/pandas/tests/io/data/stata/stata4_104.dta b/pandas/tests/io/data/stata/stata4_104.dta
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -267,7 +267,7 @@ def test_read_dta4(self, version, datapath):
         # stata doesn't save .category metadata
         tm.assert_frame_equal(parsed, expected)
 
-    @pytest.mark.parametrize("version", [105, 108])
+    @pytest.mark.parametrize("version", [103, 104, 105, 108])
     def test_readold_dta4(self, version, datapath):
         # This test is the same as test_read_dta4 above except that the columns
         # had to be renamed to match the restrictions in older file format
@@ -2011,6 +2011,18 @@ def test_backward_compat(version, datapath):
     tm.assert_frame_equal(old_dta, expected, check_dtype=False)
 
 
+@pytest.mark.parametrize("version", [103, 104])
+def test_backward_compat_nodateconversion(version, datapath):
+    # The Stata data format prior to 105 did not support a date format
+    # so read the raw values for comparison
+    data_base = datapath("io", "data", "stata")
+    ref = os.path.join(data_base, "stata-compat-118.dta")
+    old = os.path.join(data_base, f"stata-compat-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    old_dta = read_stata(old, convert_dates=False)
+    tm.assert_frame_equal(old_dta, expected, check_dtype=False)
+
+
 @pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118])
 def test_bigendian(version, datapath):
     ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
@@ -2020,6 +2032,17 @@ def test_bigendian(version, datapath):
     tm.assert_frame_equal(big_dta, expected)
 
 
+@pytest.mark.parametrize("version", [103, 104])
+def test_bigendian_nodateconversion(version, datapath):
+    # The Stata data format prior to 105 did not support a date format
+    # so read the raw values for comparison
+    ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
+    big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    big_dta = read_stata(big, convert_dates=False)
+    tm.assert_frame_equal(big_dta, expected)
+
+
 def test_direct_read(datapath, monkeypatch):
     file_path = datapath("io", "data", "stata", "stata-compat-118.dta")