Loading...

Type: Question
Resolution: Unresolved
Priority: Unknown
Fix Version/s: None
Affects Version/s: None
Component/s: None
Labels:
None

Confidence Status:
None

Assigned Teams:

Python Drivers

Aha! Reference:
None
Tracking Level:
None
Risk Status:
None
Exec Notes:
None
Goal Name:
None
Goal Link:
None

Via

https://github.com/mongodb-labs/mongo-arrow/issues/242

Hello guys,
I am importing a big dataset from mongo:

`pd_confirmacao_conversao = find_arrow_all(pd_confirmacao_conversao, {'estadoContabilizacaoEvento': { '$lt': 100}})`

After that, I converted it to a pandas DataFrame:

`pd_confirmacao_conversao = pd_confirmacao_conversao.to_pandas()`

My issue is that my original dataframe contains two columns that contain ObjectIds ('_id' and 'referenciaConversao').
Because of that, when I try to run df.info(), it crashes!

```
pd_confirmacao_conversao.info()

---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[9], line 1
----> 1 pd_confirmacao_conversao.info()

File /projeto/libs/lib/python3.11/site-packages/pandas/core/frame.py:3659, in DataFrame.info(self, verbose, buf, max_cols, memory_usage, show_counts)
3646 @doc(INFO_DOCSTRING, **frame_sub_kwargs)
3647 def info(
3648 self,
(...)
3653 show_counts: bool | None = None,
3654 ) -> None:
3655 info = DataFrameInfo(
3656 data=self,
3657 memory_usage=memory_usage,
3658 )
-> 3659 info.render(
3660 buf=buf,
3661 max_cols=max_cols,
3662 verbose=verbose,
3663 show_counts=show_counts,
3664 )

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:512, in DataFrameInfo.render(self, buf, max_cols, verbose, show_counts)
498 def render(
499 self,
500 *,
(...)
504 show_counts: bool | None,
505 ) -> None:
506 printer = _DataFrameInfoPrinter(
507 info=self,
508 max_cols=max_cols,
509 verbose=verbose,
510 show_counts=show_counts,
511 )
--> 512 printer.to_buffer(buf)

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:583, in _InfoPrinterAbstract.to_buffer(self, buf)
581 """Save dataframe info into buffer."""
582 table_builder = self._create_table_builder()
--> 583 lines = table_builder.get_lines()
584 if buf is None: # pragma: no cover
585 buf = sys.stdout

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:790, in _DataFrameTableBuilder.get_lines(self)
788 self._fill_empty_info()
789 else:
--> 790 self._fill_non_empty_info()
791 return self._lines

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:960, in _DataFrameTableBuilderVerbose._fill_non_empty_info(self)
958 self.add_dtypes_line()
959 if self.display_memory_usage:
--> 960 self.add_memory_usage_line()

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:820, in _DataFrameTableBuilder.add_memory_usage_line(self)
818 def add_memory_usage_line(self) -> None:
819 """Add line containing memory usage."""
--> 820 self._lines.append(f"memory usage: {self.memory_usage_string}")

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:750, in _TableBuilderAbstract.memory_usage_string(self)
747 @property
748 def memory_usage_string(self) -> str:
749 """Memory usage string with proper size qualifier."""
--> 750 return self.info.memory_usage_string

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:413, in _BaseInfo.memory_usage_string(self)
410 @property
411 def memory_usage_string(self) -> str:
412 """Memory usage in a form of human readable string."""
--> 413 return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n"

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:496, in DataFrameInfo.memory_usage_bytes(self)
493 @property
494 def memory_usage_bytes(self) -> int:
495 deep = self.memory_usage == "deep"
--> 496 return self.data.memory_usage(index=True, deep=deep).sum()

File /projeto/libs/lib/python3.11/site-packages/pandas/core/frame.py:3755, in DataFrame.memory_usage(self, index, deep)
3666 def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
3667 """
3668 Return the memory usage of each column in bytes.
3669
(...)
3753 5244
3754 """
-> 3755 result = self._constructor_sliced(
3756 [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
3757 index=self.columns,
3758 dtype=np.intp,
3759 )
3760 if index:
3761 index_memory_usage = self._constructor_sliced(
3762 self.index.memory_usage(deep=deep), index=["Index"]
3763 )

File /projeto/libs/lib/python3.11/site-packages/pandas/core/series.py:584, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
582 data = data.copy()
583 else:
--> 584 data = sanitize_array(data, index, dtype, copy)
586 manager = _get_option("mode.data_manager", silent=True)
587 if manager == "block":

File /projeto/libs/lib/python3.11/site-packages/pandas/core/construction.py:651, in sanitize_array(data, index, dtype, copy, allow_2d)
648 subarr = np.array([], dtype=np.float64)
650 elif dtype is not None:
--> 651 subarr = _try_cast(data, dtype, copy)
653 else:
654 subarr = maybe_convert_platform(data)

File /projeto/libs/lib/python3.11/site-packages/pandas/core/construction.py:818, in _try_cast(arr, dtype, copy)
813 # GH#15832: Check if we are requesting a numeric dtype and
814 # that we can convert the data to the requested dtype.
815 elif dtype.kind in "iu":
816 # this will raise if we have e.g. floats
--> 818 subarr = maybe_cast_to_integer_array(arr, dtype)
819 elif not copy:
820 subarr = np.asarray(arr, dtype=dtype)

File /projeto/libs/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1657, in maybe_cast_to_integer_array(arr, dtype)
1650 if not np_version_gt2:
1651 warnings.filterwarnings(
1652 "ignore",
1653 "NumPy will stop allowing conversion of "
1654 "out-of-bound Python int",
1655 DeprecationWarning,
1656 )
-> 1657 casted = np.asarray(arr, dtype=dtype)
1658 else:
1659 with warnings.catch_warnings():

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'method'
```

I can fix it by converting those columns to strings, but I want to understand what the expected behavior should be without converting them.

Thanks in advance!

```
pandas 2.2.2
pyarrow 17.0.0
pymongo 4.8.0
pymongoarrow 1.5.1
```

Details

Description

Attachments

Activity

People

Dates