Hello.
I convert an xlsx file to a csv file, load the csv into a Dask DataFrame for preprocessing, and then call compute() to turn it back into a pandas DataFrame. Every time I run the code below, the following error occurs:
RuntimeError: cannot schedule new futures after interpreter shutdown
Is there a solution?
def convert_xlsx_to_csv(file_path):
    """Convert the first sheet of an xlsx workbook to a csv file beside it.

    A hidden Excel instance is started through xlwings, the range that
    expands from cell A3 of the first sheet is read as a pandas DataFrame,
    and that frame is written (with ``index=False``) to a csv file whose path
    is ``file_path`` with its extension replaced by ``.csv``.

    Args:
        file_path: Path of the workbook to convert.

    Returns:
        The path of the csv file that was written, or ``None`` if any step
        failed (the error is printed, not raised).
    """
    import os  # local import: the file's import block is not visible here

    # Bind both handles before the try block so the cleanup in ``finally``
    # never touches an unbound name when Excel fails to start or the
    # workbook cannot be opened.
    app = None
    wb = None
    try:
        app = xw.App(visible=False)
        wb = app.books.open(file_path)
        sheet1 = wb.sheets[0]
        # Swap only the trailing extension; str.replace would also rewrite
        # any '.xlsx' that appears earlier in the path.
        csv_file_path = os.path.splitext(file_path)[0] + '.csv'
        # NOTE(review): xlwings' DataFrame converter appears to use the first
        # column as the index by default, and index=False then leaves it out
        # of the csv -- confirm that dropping that column is intended.
        sheet1.range('A3').expand().options(pd.DataFrame).value.to_csv(csv_file_path, index=False)
        return csv_file_path
    except Exception as e:
        print(f"Fail: {file_path}, Error: {e}")
        return None
    finally:
        # Always try to quit Excel, even if closing the workbook fails, so a
        # hidden Excel process is not left running.
        try:
            if wb is not None:
                wb.close()
        finally:
            if app is not None:
                app.quit()
def read_and_process_file(file_path):
    """Load a csv or xlsx file into a pandas DataFrame by way of Dask.

    A path ending in ``.csv`` is read directly with ``dd.read_csv``. A path
    ending in ``.xlsx`` is first converted with ``convert_xlsx_to_csv`` and
    the resulting csv is read. The Dask DataFrame is then materialised with
    ``compute``.

    Args:
        file_path: Path of the file to load.

    Returns:
        The computed pandas DataFrame. An empty ``pd.DataFrame()`` is
        returned when the extension is neither ``.csv`` nor ``.xlsx``, when
        the xlsx conversion yields no csv path, or when reading or computing
        raises (the error is printed, not raised).
    """
    try:
        if file_path.endswith('.csv'):
            ddf = dd.read_csv(file_path)
        elif file_path.endswith('.xlsx'):
            csv_file_path = convert_xlsx_to_csv(file_path)
            if csv_file_path:
                ddf = dd.read_csv(csv_file_path)
            else:
                return pd.DataFrame()
        else:
            return pd.DataFrame()
    except Exception as e:
        print(f"fail: {file_path}, Error: {e}")
        return pd.DataFrame()
    try:
        try:
            # Scheduler may be one of 'threads', 'processes', 'synchronous'.
            df = ddf.compute(scheduler='threads')
        except RuntimeError:
            # The threaded scheduler submits work to a thread pool, which
            # refuses new tasks once the interpreter has started shutting
            # down ("cannot schedule new futures after interpreter
            # shutdown"). Retry in the calling thread, which needs no pool.
            df = ddf.compute(scheduler='synchronous')
    except Exception as e:
        print(f"Fail: {file_path}, Error: {e}")
        return pd.DataFrame()
    return df