For context, I’m working on WIP: update repo by raybellwaves · Pull Request #4 · intake-dynamodb/intake-dynamodb · GitHub
My output is data : list[dict[str, Any]] which I would like to pass straight to dask.
I’m trying dd.from_dict(data, npartitions=1) but getting
TypeError: An error occurred while calling the from_dict method registered to the pandas backend.
Original Message: 'numpy.ndarray' object is not callable
Here’s a simple example.
import dask.dataframe as dd
import pandas as pd
data = [{"a": 1}, {"a": 2}]
pd.DataFrame(data)
dd.from_dict(data, npartitions=1)
The error here is that ‘list’ object has no attribute ‘values’:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
File ~/miniforge3/envs/main/lib/python3.10/site-packages/dask/backends.py:135, in CreationDispatch.register_inplace.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
134 try:
--> 135 return func(*args, **kwargs)
136 except Exception as e:
File ~/miniforge3/envs/main/lib/python3.10/site-packages/dask/dataframe/io/io.py:351, in from_dict(data, npartitions, orient, dtype, columns, constructor)
320 """
321 Construct a Dask DataFrame from a Python Dictionary
322
(...)
348 >>> ddf = dd.from_dict({"num1": [1, 2, 3, 4], "num2": [7, 8, 9, 10]}, npartitions=2)
349 """
--> 351 collection_types = {type(v) for v in data.values() if is_dask_collection(v)}
352 if collection_types:
AttributeError: 'list' object has no attribute 'values'
The above exception was the direct cause of the following exception:
AttributeError Traceback (most recent call last)
Cell In[49], line 1
----> 1 dd.from_dict(data, npartitions=1)
File ~/miniforge3/envs/main/lib/python3.10/site-packages/dask/backends.py:137, in CreationDispatch.register_inplace.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
135 return func(*args, **kwargs)
136 except Exception as e:
--> 137 raise type(e)(
138 f"An error occurred while calling the {funcname(func)} "
139 f"method registered to the {self.backend} backend.\n"
140 f"Original Message: {e}"
141 ) from e
AttributeError: An error occurred while calling the from_dict method registered to the pandas backend.
Original Message: 'list' object has no attribute 'values'
If I try passing one item from the list, i.e. a single dictionary, I get “If using all scalar values, you must pass an index”:
dd.from_dict(data[0], npartitions=1)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File ~/miniforge3/envs/main/lib/python3.10/site-packages/dask/backends.py:135, in CreationDispatch.register_inplace.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
134 try:
--> 135 return func(*args, **kwargs)
136 except Exception as e:
File ~/miniforge3/envs/main/lib/python3.10/site-packages/dask/dataframe/io/io.py:359, in from_dict(data, npartitions, orient, dtype, columns, constructor)
353 raise NotImplementedError(
354 "from_dict doesn't currently support Dask collections as inputs. "
355 f"Objects of type {collection_types} were given in the input dict."
356 )
358 return from_pandas(
--> 359 constructor.from_dict(data, orient, dtype, columns),
360 npartitions,
361 )
File ~/miniforge3/envs/main/lib/python3.10/site-packages/pandas/core/frame.py:1677, in DataFrame.from_dict(cls, data, orient, dtype, columns)
1676 if orient != "tight":
-> 1677 return cls(data, index=index, columns=columns, dtype=dtype)
1678 else:
File ~/miniforge3/envs/main/lib/python3.10/site-packages/pandas/core/frame.py:636, in DataFrame.__init__(self, data, index, columns, dtype, copy)
634 elif isinstance(data, dict):
635 # GH#38939 de facto copy defaults to False only in non-dict cases
--> 636 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
637 elif isinstance(data, ma.MaskedArray):
File ~/miniforge3/envs/main/lib/python3.10/site-packages/pandas/core/internals/construction.py:502, in dict_to_mgr(data, index, columns, dtype, typ, copy)
500 # TODO: can we get rid of the dt64tz special case above?
--> 502 return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
File ~/miniforge3/envs/main/lib/python3.10/site-packages/pandas/core/internals/construction.py:120, in arrays_to_mgr(arrays, columns, index, dtype, verify_integrity, typ, consolidate)
119 if index is None:
--> 120 index = _extract_index(arrays)
121 else:
File ~/miniforge3/envs/main/lib/python3.10/site-packages/pandas/core/internals/construction.py:664, in _extract_index(data)
663 if not indexes and not raw_lengths:
--> 664 raise ValueError("If using all scalar values, you must pass an index")
666 elif have_series:
ValueError: If using all scalar values, you must pass an index
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
Cell In[54], line 1
----> 1 dd.from_dict(data[0], npartitions=1)
File ~/miniforge3/envs/main/lib/python3.10/site-packages/dask/backends.py:137, in CreationDispatch.register_inplace.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
135 return func(*args, **kwargs)
136 except Exception as e:
--> 137 raise type(e)(
138 f"An error occurred while calling the {funcname(func)} "
139 f"method registered to the {self.backend} backend.\n"
140 f"Original Message: {e}"
141 ) from e
ValueError: An error occurred while calling the from_dict method registered to the pandas backend.
Original Message: If using all scalar values, you must pass an index
I would appreciate help understanding these error messages.
- How do I pass an index to get `dd.from_dict({"a": 1}, npartitions=1)` working?
- Is `from_dict` the best method to use to read a list of dicts? I would prefer not to use `delayed`, as I already know the partition size I would like to apply.