I just want to convert the coordinate system from lon/lat to XY cood in parallel by using dask dataframe. Following is what I did.
import h3, os, s3fs
import numpy as np
import xarray as xr
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd
import matplotlib.pyplot as plt
from shapely.geometry import box, mapping, Point
from shapely.ops import transform
def domain_mapping(lon_max, lon_min, lat_max, lat_min):
b_map = box(lon_min, lat_min, lon_max, lat_max, ccw=True)
b_map = transform(lambda x, y: (y, x), b_map)
return mapping(b_map)
def hexgrid(b_map, hex_res=12):
hex_col = 'hex' + str(hex_res)
target_df = pd.DataFrame(h3.polyfill( b_map, hex_res), columns=[hex_col])
target_df['lat'] = target_df[hex_col].apply(lambda x: h3.h3_to_geo(x)[0])
target_df['lon'] = target_df[hex_col].apply(lambda x: h3.h3_to_geo(x)[1])
target_df['geometry'] = target_df.apply(lambda row: Point(row["lon"],
row["lat"]), axis=1)
return target_df
def to_xy(df, espg=32723):
return gpd.GeoDataFrame(df).set_crs(4236).to_crs(espg)
lon_max, lon_min = -75.5, -75.6
lat_max, lat_min = -31.2, -31.3
b_map = domain_mapping(lon_max, lon_min, lat_max, lat_min)
pd_df = hexgrid(b_map)
dd_df = dd.from_pandas(pd_df, npartitions=1000)
#-- Using pandas/geopandas directly
#-- taking 21.8 s ± 56.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
gdf = to_xy(pd_df)
#-- Using dd.dataframe/geopandas
#-- taking 4min 49s ± 560 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
@delayed
def to_ddxy(df, espg=32723):
gdf = gpd.GeoDataFrame(df).set_crs(4236).to_crs(espg)
df['xc'] = gdf['geometry'].map(lambda p: p.x)
df['yc'] = gdf['geometry'].map(lambda p: p.y)
return dd.from_pandas(df.drop('geometry',axis=1), npartitions=1)
abc = delayed(dd.concat)([to_ddxy(df) for df in dd_df.to_delayed()]).compute()
Could anyone tell why my parallel process is way slower than the straight forward method?
Thanks