From 22fce11107c02ec36e9811edec655b93a73ac2f3 Mon Sep 17 00:00:00 2001 From: Eric Chen Date: Tue, 10 Jun 2025 13:00:25 -0700 Subject: [PATCH 1/5] Initial testcase provided in Issue --- pandas/tests/io/test_stata.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index e73de78847c8f..cbf245d42b7f9 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2601,3 +2601,12 @@ def test_strl_missings(temp_file, version): ] ) df.to_stata(temp_file, version=version) + +@pytest.mark.parametrize("version", [117, 118, 119, None]) +def test_ascii_error(temp_file, version): + # GH #61583 + # Check that 2 byte long unicode characters doesn't cause export error + df = pd.DataFrame({'doubleByteCol': ['§'*1500]}) + df.to_stata(temp_file, write_index=0, version=version) + df_input = read_stata(temp_file) + tm.assert_frame_equal(df, df_input) \ No newline at end of file From bb411801cf387fb5af97a08ee1b242fe8c72ee60 Mon Sep 17 00:00:00 2001 From: Eric Chen Date: Tue, 10 Jun 2025 14:55:41 -0700 Subject: [PATCH 2/5] Replaced check for encoded with unencoded check to prevent edge cases where two values are different --- pandas/io/stata.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index cd290710ddbaa..271a1a96aa8fc 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2739,7 +2739,7 @@ def _encode_strings(self) -> None: encoded = self.data[col].str.encode(self._encoding) # If larger than _max_string_length do nothing if ( - max_len_string_array(ensure_object(encoded._values)) + max_len_string_array(ensure_object(self.data[col]._values)) <= self._max_string_length ): self.data[col] = encoded @@ -3263,11 +3263,15 @@ def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes: bio.write(gso_type) # llll - utf8_string = bytes(strl, "utf-8") - bio.write(struct.pack(len_type, len(utf8_string) + 1)) + if(type(strl) == str): + strl_convert = bytes(strl, "utf-8") + else: + strl_convert = strl + + bio.write(struct.pack(len_type, len(strl_convert) + 1)) # xxx...xxx - bio.write(utf8_string) + bio.write(strl_convert) bio.write(null) return bio.getvalue() From c3251bc606b2fdb54f63c628e71472f99eeb0860 Mon Sep 17 00:00:00 2001 From: Eric Chen Date: Tue, 10 Jun 2025 15:31:01 -0700 Subject: [PATCH 3/5] replaced type check with isinstance() --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 271a1a96aa8fc..77adea6b02c61 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -3263,7 +3263,7 @@ def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes: bio.write(gso_type) # llll - if(type(strl) == str): + if(isinstance(strl, str)): strl_convert = bytes(strl, "utf-8") else: strl_convert = strl From 4a13c49b618b7aaa06dbfe0539d52c7091cd2c13 Mon Sep 17 00:00:00 2001 From: Eric Chen Date: Tue, 10 Jun 2025 15:33:05 -0700 Subject: [PATCH 4/5] Updated patch notes --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 03a386708323d..1e315906e8240 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -775,6 +775,7 @@ I/O - Bug in :meth:`DataFrame.to_stata` when exporting a column containing both long strings (Stata strL) and :class:`pd.NA` values (:issue:`23633`) - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`) +- Bug in :meth:`DataFrame.to_stata` when input encoded length and normal length are mismatched (:issue:`61583`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`) From 0ca0c9ece8d5352f7f31df119a360a8c132589f5 Mon Sep 17 00:00:00 2001 From: Eric Chen Date: Tue, 10 Jun 2025 15:41:14 -0700 Subject: [PATCH 5/5] pre-commit checks --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/io/stata.py | 4 ++-- pandas/tests/io/test_stata.py | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1e315906e8240..61951e25bb35f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -773,9 +773,9 @@ I/O - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`) - Bug in :meth:`DataFrame.to_stata` when exporting a column containing both long strings (Stata strL) and :class:`pd.NA` values (:issue:`23633`) +- Bug in :meth:`DataFrame.to_stata` when input encoded length and normal length are mismatched (:issue:`61583`) - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`) -- Bug in :meth:`DataFrame.to_stata` when input encoded length and normal length are mismatched (:issue:`61583`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 77adea6b02c61..092c24f0d31c3 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -3263,11 +3263,11 @@ def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes: bio.write(gso_type) # llll - if(isinstance(strl, str)): + if isinstance(strl, str): strl_convert = bytes(strl, "utf-8") else: strl_convert = strl - + bio.write(struct.pack(len_type, len(strl_convert) + 1)) # xxx...xxx diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index cbf245d42b7f9..b155c0cca4aa6 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2602,11 +2602,12 @@ def test_strl_missings(temp_file, version): ) df.to_stata(temp_file, version=version) + @pytest.mark.parametrize("version", [117, 118, 119, None]) def test_ascii_error(temp_file, version): # GH #61583 # Check that 2 byte long unicode characters doesn't cause export error - df = pd.DataFrame({'doubleByteCol': ['§'*1500]}) + df = DataFrame({"doubleByteCol": ["§" * 1500]}) df.to_stata(temp_file, write_index=0, version=version) df_input = read_stata(temp_file) - tm.assert_frame_equal(df, df_input) \ No newline at end of file + tm.assert_frame_equal(df, df_input)