Uploaded image for project: 'Python Integrations'
  1. Python Integrations
  2. INTPYTHON-256

Handling ObjectId after to_pandas

    • Type: Icon: Question Question
    • Resolution: Unresolved
    • Priority: Icon: Unknown Unknown
    • None
    • Affects Version/s: None
    • Component/s: None
    • None
    • Python Drivers

      Via 

      https://github.com/mongodb-labs/mongo-arrow/issues/242

      Hello guys,
      I am importing a big dataset from mongo:

      `pd_confirmacao_conversao = find_arrow_all(pd_confirmacao_conversao, {'estadoContabilizacaoEvento': { '$lt': 100}})`

      After that, I've just exported it to a pandas DataFrame:

      `pd_confirmacao_conversao = pd_confirmacao_conversao.to_pandas()`

      My issue is that my original DataFrame contains two columns that contain ObjectIds ('_id' and 'referenciaConversao').
      Because of that, when I try to run df.info(), it crashes!

      ```
      pd_confirmacao_conversao.info()

      ---------------------------------------------------------------------------
      TypeError                                 Traceback (most recent call last)
      Cell In[9], line 1
      ----> 1 pd_confirmacao_conversao.info()

      File /projeto/libs/lib/python3.11/site-packages/pandas/core/frame.py:3659, in DataFrame.info(self, verbose, buf, max_cols, memory_usage, show_counts)
         3646 @doc(INFO_DOCSTRING, **frame_sub_kwargs)
         3647 def info(
         3648     self,
         (...)
         3653     show_counts: bool | None = None,
         3654 ) -> None:
         3655     info = DataFrameInfo(
         3656         data=self,
         3657         memory_usage=memory_usage,
         3658     )
      -> 3659     info.render(
         3660         buf=buf,
         3661         max_cols=max_cols,
         3662         verbose=verbose,
         3663         show_counts=show_counts,
         3664     )

      File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:512, in DataFrameInfo.render(self, buf, max_cols, verbose, show_counts)
          498 def render(
          499     self,
          500     *,
         (...)
          504     show_counts: bool | None,
          505 ) -> None:
          506     printer = _DataFrameInfoPrinter(
          507         info=self,
          508         max_cols=max_cols,
          509         verbose=verbose,
          510         show_counts=show_counts,
          511     )
      --> 512     printer.to_buffer(buf)

      File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:583, in _InfoPrinterAbstract.to_buffer(self, buf)
          581 """Save dataframe info into buffer."""
          582 table_builder = self._create_table_builder()
      --> 583 lines = table_builder.get_lines()
          584 if buf is None:  # pragma: no cover
          585     buf = sys.stdout

      File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:790, in _DataFrameTableBuilder.get_lines(self)
          788     self._fill_empty_info()
          789 else:
      --> 790     self._fill_non_empty_info()
          791 return self._lines

      File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:960, in _DataFrameTableBuilderVerbose._fill_non_empty_info(self)
          958 self.add_dtypes_line()
          959 if self.display_memory_usage:
      --> 960     self.add_memory_usage_line()

      File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:820, in _DataFrameTableBuilder.add_memory_usage_line(self)
          818 def add_memory_usage_line(self) -> None:
          819     """Add line containing memory usage."""
      --> 820     self._lines.append(f"memory usage: {self.memory_usage_string}")

      File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:750, in _TableBuilderAbstract.memory_usage_string(self)
          747 @property
          748 def memory_usage_string(self) -> str:
          749     """Memory usage string with proper size qualifier."""
      --> 750     return self.info.memory_usage_string

      File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:413, in _BaseInfo.memory_usage_string(self)
          410 @property
          411 def memory_usage_string(self) -> str:
          412     """Memory usage in a form of human readable string."""
      --> 413     return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n"

      File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:496, in DataFrameInfo.memory_usage_bytes(self)
          493 @property
          494 def memory_usage_bytes(self) -> int:
          495     deep = self.memory_usage == "deep"
      --> 496     return self.data.memory_usage(index=True, deep=deep).sum()

      File /projeto/libs/lib/python3.11/site-packages/pandas/core/frame.py:3755, in DataFrame.memory_usage(self, index, deep)
         3666 def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
         3667     """
         3668     Return the memory usage of each column in bytes.
         3669 
         (...)
         3753     5244
         3754     """
      -> 3755     result = self._constructor_sliced(
         3756         [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
         3757         index=self.columns,
         3758         dtype=np.intp,
         3759     )
         3760     if index:
         3761         index_memory_usage = self._constructor_sliced(
         3762             self.index.memory_usage(deep=deep), index=["Index"]
         3763         )

      File /projeto/libs/lib/python3.11/site-packages/pandas/core/series.py:584, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
          582         data = data.copy()
          583 else:
      --> 584     data = sanitize_array(data, index, dtype, copy)
          586     manager = _get_option("mode.data_manager", silent=True)
          587     if manager == "block":

      File /projeto/libs/lib/python3.11/site-packages/pandas/core/construction.py:651, in sanitize_array(data, index, dtype, copy, allow_2d)
          648     subarr = np.array([], dtype=np.float64)
          650 elif dtype is not None:
      --> 651     subarr = _try_cast(data, dtype, copy)
          653 else:
          654     subarr = maybe_convert_platform(data)

      File /projeto/libs/lib/python3.11/site-packages/pandas/core/construction.py:818, in _try_cast(arr, dtype, copy)
          813 # GH#15832: Check if we are requesting a numeric dtype and
          814 # that we can convert the data to the requested dtype.
          815 elif dtype.kind in "iu":
          816     # this will raise if we have e.g. floats
      --> 818     subarr = maybe_cast_to_integer_array(arr, dtype)
          819 elif not copy:
          820     subarr = np.asarray(arr, dtype=dtype)

      File /projeto/libs/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1657, in maybe_cast_to_integer_array(arr, dtype)
         1650         if not np_version_gt2:
         1651             warnings.filterwarnings(
         1652                 "ignore",
         1653                 "NumPy will stop allowing conversion of "
         1654                 "out-of-bound Python int",
         1655                 DeprecationWarning,
         1656             )
      -> 1657         casted = np.asarray(arr, dtype=dtype)
         1658 else:
         1659     with warnings.catch_warnings():

      TypeError: int() argument must be a string, a bytes-like object or a real number, not 'method'
      ```

      I can fix it by converting the dtype to string, but I want to understand what the expected behavior should be without converting it.

      Thanks in advance!

      ```
      pandas                            2.2.2
      pyarrow                          17.0.0
      pymongo                        4.8.0
      pymongoarrow               1.5.1
      ```

            Assignee:
            Unassigned Unassigned
            Reporter:
            alex.clark@mongodb.com Alex Clark
            Votes:
            0 Vote for this issue
            Watchers:
            1 Start watching this issue

              Created:
              Updated: