I always get a `ValueError` when I run the Python code below, which aggregates a Dask DataFrame using custom aggregation functions.
Could someone please point out what I am doing wrong here?
Thank you.
import dask.dataframe as dd
import pandas as pd
# Sample data: nine rows across groups A, B and C; the 'value' column
# deliberately mixes Python ints and strings.
# NOTE(review): some Dask versions may convert object columns to pyarrow
# strings on load (config `dataframe.convert-string`) -- confirm the ints
# in 'value' survive `from_pandas` in your environment.
_sample_pdf = pd.DataFrame(
    {
        'group': ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C'],
        'value': [1, 'two', 2, 'three', 3, 'one', 2, 3, 'four'],
    }
)
# Split the pandas frame into two Dask partitions.
df = dd.from_pandas(_sample_pdf, npartitions=2)
# Custom aggregation class
class CountCondition:
    """Callbacks for a Dask ``Aggregation`` that counts values matching a predicate.

    Dask's documented contract for custom aggregations is:

    * ``chunk`` is called once per partition with the *grouped* column
      (a pandas ``SeriesGroupBy``) and must return a Series indexed by group.
    * ``agg`` is called with the concatenated chunk results, again grouped,
      and must reduce them to one value per group.
    * ``finalize`` receives the aggregated Series.
    """

    def __init__(self, condition):
        """Store the predicate.

        Args:
            condition: Callable taking a single value and returning a truthy
                result when that value should be counted.
        """
        self.condition = condition

    def _count_matches(self, values):
        """Return how many elements of one group's ``values`` satisfy the predicate."""
        return values.map(self.condition).sum()

    def chunk(self, s):
        """Count matching values per group within one partition.

        Args:
            s: Grouped column for the partition (pandas ``SeriesGroupBy``).

        Returns:
            Series indexed by group key holding the per-group match count.
        """
        # ``s`` is a SeriesGroupBy, so ``s.apply(f)`` calls ``f`` once per
        # group with that group's Series. The predicate must therefore be
        # mapped over the elements *inside* each group; applying it to the
        # group object itself and summing yields a single scalar, which Dask
        # cannot align with the group index.
        return s.apply(self._count_matches)

    def agg(self, chunks):
        """Add up the per-partition counts for each group.

        Args:
            chunks: Grouped per-partition counts (pandas ``SeriesGroupBy``).

        Returns:
            Series indexed by group key holding the total count.
        """
        return chunks.sum()

    def finalize(self, x):
        """Return the aggregated counts unchanged; no post-processing is needed."""
        return x
# Predicate used to count integer values
def condition_integers(x: object) -> bool:
    """Return True when ``x`` is an ``int`` instance.

    ``bool`` is a subclass of ``int`` in Python, so ``True``/``False`` also
    satisfy this check.
    """
    is_integer = isinstance(x, int)
    return is_integer
# Predicate used to count string values
def condition_strings(x: object) -> bool:
    """Return True when ``x`` is a ``str`` instance."""
    is_string = isinstance(x, str)
    return is_string
# One CountCondition per predicate
count_integers = CountCondition(condition_integers)
count_strings = CountCondition(condition_strings)

# Wrap each instance's callbacks in a Dask Aggregation
agg_integers = dd.Aggregation(
    'count_integers',
    count_integers.chunk,
    count_integers.agg,
    finalize=count_integers.finalize,
)
agg_strings = dd.Aggregation(
    'count_strings',
    count_strings.chunk,
    count_strings.agg,
    finalize=count_strings.finalize,
)

# Build the lazy per-group aggregations of the 'value' column
result_integers = df.groupby('group').agg({'value': agg_integers})
result_strings = df.groupby('group').agg({'value': agg_strings})

# Materialize both results (integers first, then strings)
result_integers, result_strings = (
    result_integers.compute(),
    result_strings.compute(),
)

# Show the integer counts followed by the string counts
print(result_integers, result_strings, sep='\n')