import pandas as pd
from numpy import dtype

def calc_diff_per_port(x):
    # order the samples in time before differencing
    x_sorted = x.sort_values(by='timestamp').reset_index(drop=True)
    x_diff = x_sorted[['timestamp', 'PortXmitDataExtended', 'PortRcvDataExtended']].diff(axis=0)
    x_diff = x_diff.rename(columns={'timestamp': 'timedelta',
                                    'PortXmitDataExtended': 'xmit_data',
                                    'PortRcvDataExtended': 'rcv_data'})
    # join the time axis back on
    x_diff = pd.concat([x_diff, x_sorted['timestamp']], axis=1)
    # remove the first entry, as diff() leaves it null
    x_diff = x_diff.iloc[1:]
    # compute total seconds between consecutive samples
    x_diff['total_seconds'] = x_diff['timedelta'].dt.total_seconds()
    # compute the per-second rate
    rate = x_diff[['xmit_data', 'rcv_data']].divide(x_diff['total_seconds'], axis=0)
    # change units from bytes/s to Gbps
    rate = rate / 2**30 * 8
    rate.columns = ['xmit_gbps', 'rcv_gbps']
    # concat rate with diffs
    x_concat = pd.concat([x_diff, rate], axis=1)
    return x_concat
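For context, the function does what I expect on a plain pandas frame for a single port (hand-made illustrative values below, not real counter data):

sample = pd.DataFrame({
    'timestamp': pd.to_datetime(['2021-01-01 00:00:00',
                                 '2021-01-01 00:00:10',
                                 '2021-01-01 00:00:20']),
    'PortXmitDataExtended': [0.0, 1.0e9, 2.5e9],
    'PortRcvDataExtended': [0.0, 5.0e8, 1.5e9],
})
# produces two rows with timedelta, xmit_data, rcv_data, timestamp,
# total_seconds, xmit_gbps and rcv_gbps columns
print(calc_diff_per_port(sample))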
meta = {'timedelta': dtype('<m8[ns]'),
        'xmit_data': dtype('float64'),
        'rcv_data': dtype('float64'),
        'timestamp': dtype('<M8[ns]'),
        'total_seconds': dtype('float64'),
        'xmit_gbps': dtype('float64'),
        'rcv_gbps': dtype('float64')}
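(This meta matches what the function itself reports on the sample frame above, so it could equally be built as:

meta = calc_diff_per_port(sample).dtypes.to_dict()
)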
port_grp = port_data.groupby(['NodeGUID','PortNumber'])
port_rate = port_grp.apply(calc_diff_per_port, meta=meta)
'NodeGUID' in port_rate.reset_index().columns
> False
See a similar issue: Losing columns in Dask group by expression - Stack Overflow.
I wasn't able to produce a minimal reproducible example.
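One workaround that appears to keep the keys (just a sketch, not a confirmed fix) is to re-attach the grouping columns inside the applied function, since each group passed to apply still contains them, and to extend meta accordingly. The wrapper name, meta_keyed, and the dtypes for NodeGUID/PortNumber below are my own guesses about the source data:

def calc_diff_per_port_keyed(x):
    out = calc_diff_per_port(x)
    # carry the grouping columns through explicitly so they survive the apply
    out['NodeGUID'] = x['NodeGUID'].iloc[0]
    out['PortNumber'] = x['PortNumber'].iloc[0]
    return out

# assumed dtypes for the two key columns
meta_keyed = dict(meta, NodeGUID=dtype('O'), PortNumber=dtype('int64'))
port_rate = port_grp.apply(calc_diff_per_port_keyed, meta=meta_keyed)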