-
Type:
Bug
-
Resolution: Fixed
-
Priority:
Unknown
-
Affects Version/s: None
-
Component/s: None
-
None
As reported in https://github.com/mongodb-labs/mongo-arrow/issues/208.
We should be able to handle the following:
# Reproduction script: compares the Arrow table built by pyarrow's JSON reader
# with the one built by pymongoarrow's find_arrow_all for the same documents,
# one with a populated embedded array and one with an empty embedded array.
from pymongo import MongoClient
import pymongoarrow.api as pmaapi
import pyarrow.parquet as papq
import pyarrow.json as pajson
import io
import json
import bson

client = MongoClient()
collection = client.testdb.data
collection.drop()
client.testdb.data.insert_many([
    {'_id': 1, 'foo': {'bar': ['1', '2']}},
    {'_id': 2, 'foo': {'bar': []}},
])

# get document out of mongo, put it in a file and read it with pyarrow and write it to parquet
doc1 = client.testdb.data.find_one({'_id': 1})
string1 = bson.json_util.dumps(doc1, indent=2)
file1 = io.BytesIO(bytes(string1, encoding='utf-8'))
papatable1 = pajson.read_json(file1)
print(str(papatable1))
papq.write_table(papatable1, 'pyarrow' + str(1) + '.parquet')

# read document with pymongoarrow and write it to parquet
pmapatable1 = pmaapi.find_arrow_all(client.testdb.data, {'_id': {'$eq': 1}})
print(str(pmapatable1))
papq.write_table(pmapatable1, 'pymongoarrow' + str(1) + '.parquet')

# Same round trip through pyarrow for the document whose embedded array is empty.
doc2 = client.testdb.data.find_one({'_id': 2})
string2 = bson.json_util.dumps(doc2, indent=2)
file2 = io.BytesIO(bytes(string2, encoding='utf-8'))
papatable2 = pajson.read_json(file2)
print(str(papatable2))
papq.write_table(papatable2, 'pyarrow' + str(2) + '.parquet')

# This find_arrow_all call is the one that fails in the reported traceback
# ("TypeError: List requires DataType or Field").
pmapatable2 = pmaapi.find_arrow_all(client.testdb.data, {'_id': {'$eq': 2}})
papq.write_table(pmapatable2, 'pymongoarrow' + str(2) + '.parquet')
Instead it produces:
$ python repro.py
pyarrow.Table
_id: int64
foo: struct<bar: list<item: string>>
child 0, bar: list<item: string>
child 0, item: string
----
_id: [[1]]
foo: [
-- is_valid: all not null
-- child 0 type: list<item: string>
[["1","2"]]]
pyarrow.Table
_id: int32
foo: struct<bar: list<item: string>>
child 0, bar: list<item: string>
child 0, item: string
----
_id: [[1]]
foo: [
-- is_valid: all not null
-- child 0 type: list<item: string>
[["1","2"]]]
pyarrow.Table
_id: int64
foo: struct<bar: list<item: null>>
child 0, bar: list<item: null>
child 0, item: null
----
_id: [[2]]
foo: [
-- is_valid: all not null
-- child 0 type: list<item: null>
[0 nulls]]
Traceback (most recent call last):
File "/workspaces/vscode-python/pymongoarrow/repro.py", line 45, in <module>
pmapatable2 = pmaapi.find_arrow_all(client.testdb.data,{'_id': {'$eq': 2}})
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vscode/Envs/pma1/lib/python3.11/site-packages/pymongoarrow/api.py", line 112, in find_arrow_all
process_bson_stream(batch, context)
File "pymongoarrow/lib.pyx", line 159, in pymongoarrow.lib.process_bson_stream
File "pymongoarrow/lib.pyx", line 246, in pymongoarrow.lib.process_raw_bson_stream
File "pymongoarrow/lib.pyx", line 133, in pymongoarrow.lib.extract_document_dtype
File "pymongoarrow/lib.pyx", line 108, in pymongoarrow.lib.extract_field_dtype
File "pyarrow/types.pxi", line 4452, in pyarrow.lib.list_
TypeError: List requires DataType or Field
- has to be finished together with
-
INTPYTHON-165 Auto schema detection can yield different table on missing values
-
- Closed
-
- related to
-
INTPYTHON-575 Trouble reading documents with empty embedded arrays
-
- Closed
-