Starting EC2Cluster with dask_cloudprovider

0

I’m trying to start an EC2Cluster using dask_cloudprovider. I have already done this for Fargate, and I figured that EC2 would be similar. Problem: I tried to do it in the most basic way possible:

from dask_cloudprovider.aws import EC2Cluster
from dask.distributed import Client
cluster = EC2Cluster(
    n_workers=1, 
)

But even with this basic code I still get the following output:

Creating scheduler instance 2023-03-22 17:24:14,805 - distributed.deploy.spec - WARNING - Cluster closed without starting up --------------------------------------------------------------------------- ClientError Traceback (most recent call last) File ~/.local/lib/python3.10/site-packages/distributed/deploy/spec.py:320, in SpecCluster._start(self) 319 self.scheduler = cls(**self.scheduler_spec.get(“options”, {})) → 320 self.scheduler = await self.scheduler 321 self.scheduler_comm = rpc( 322 getattr(self.scheduler, “external_address”, None) 323 or self.scheduler.address, 324 connection_args=self.security.get_connection_args(“client”), 325 )

File ~/.local/lib/python3.10/site-packages/distributed/deploy/spec.py:68, in ProcessInterface.__await__.<locals>._() 67 if self.status == Status.created: --> 68 await self.start() 69 assert self.status == Status.running

File ~/.local/lib/python3.10/site-packages/dask_cloudprovider/generic/vmcluster.py:88, in SchedulerMixin.start(self) 87 self.cluster._log(“Creating scheduler instance”) —> 88 ip = await self.create_vm() 89 self.address = f"{self.cluster.protocol}://{ip}:{self.port}"

File ~/.local/lib/python3.10/site-packages/dask_cloudprovider/aws/ec2.py:164, in EC2Instance.create_vm(self) 162 vm_kwargs[“Placement”] = {“AvailabilityZone”: self.availability_zone} → 164 response = await client.run_instances(**vm_kwargs) 165 [self.instance] = response[“Instances”]

File ~/.local/lib/python3.10/site-packages/aiobotocore/client.py:371, in AioBaseClient._make_api_call(self, operation_name, api_params) 370 error_class = self.exceptions.from_code(error_code) → 371 raise error_class(parsed_response, operation_name) 372 else:

ClientError: An error occurred (InvalidParameterValue) when calling the RunInstances operation: User data is limited to 16384 bytes

During handling of the above exception, another exception occurred:

TypeError Traceback (most recent call last) Cell In[8], line 1 ----> 1 cluster = EC2Cluster( 2 n_workers=1, 3 )

File ~/.local/lib/python3.10/site-packages/dask_cloudprovider/aws/ec2.py:605, in EC2Cluster.__init__(self, region, availability_zone, bootstrap, auto_shutdown, ami, instance_type, scheduler_instance_type, worker_instance_type, vpc, subnet_id, security_groups, filesystem_size, key_name, iam_instance_profile, docker_image, debug, instance_tags, volume_tags, use_private_ip, enable_detailed_monitoring, **kwargs) 603 self.scheduler_options["instance_type"] = self.scheduler_instance_type 604 self.worker_options["instance_type"] = self.worker_instance_type → 605 super().__init__(debug=debug, **kwargs)

File ~/.local/lib/python3.10/site-packages/dask_cloudprovider/generic/vmcluster.py:297, in VMCluster.__init__(self, n_workers, worker_class, worker_options, scheduler_options, docker_image, docker_args, extra_bootstrap, env_vars, security, protocol, debug, **kwargs) 294 self.worker_options["extra_bootstrap"] = extra_bootstrap 295 self.uuid = str(uuid.uuid4())[:8] → 297 super().__init__(**kwargs, security=self.security)

File ~/.local/lib/python3.10/site-packages/distributed/deploy/spec.py:286, in SpecCluster.__init__(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name, shutdown_on_close, scheduler_sync_interval) 284 if not called_from_running_loop: 285 self._loop_runner.start() → 286 self.sync(self._start) 287 try: 288 self.sync(self._correct_state)

File ~/.local/lib/python3.10/site-packages/distributed/utils.py:345, in SyncMethodMixin.sync(self, func, asynchronous, callback_timeout, *args, **kwargs) 343 return future 344 else: → 345 return sync( 346 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs 347 )

File ~/.local/lib/python3.10/site-packages/distributed/utils.py:412, in sync(loop, func, callback_timeout, *args, **kwargs) 410 if error: 411 typ, exc, tb = error → 412 raise exc.with_traceback(tb) 413 else: 414 return result

File ~/.local/lib/python3.10/site-packages/distributed/utils.py:385, in sync.<locals>.f() 383 future = wait_for(future, callback_timeout) 384 future = asyncio.ensure_future(future) → 385 result = yield future 386 except Exception: 387 error = sys.exc_info()

File ~/.local/lib/python3.10/site-packages/tornado/gen.py:769, in Runner.run(self) 766 exc_info = None 768 try: → 769 value = future.result() 770 except Exception: 771 exc_info = sys.exc_info()

File ~/.local/lib/python3.10/site-packages/dask_cloudprovider/generic/vmcluster.py:339, in VMCluster._start(self) 329 self.worker_spec = { 330 self._new_worker_name(i): self.new_spec for i in range(self._n_workers) 331 } 333 with warn_on_duration( 334 “10s”, 335 "Creating your cluster is taking a surprisingly long time. " 336 "This is likely due to pending resources. " 337 "Hang tight! ", 338 ): → 339 await super()._start()

File ~/.local/lib/python3.10/site-packages/distributed/deploy/spec.py:329, in SpecCluster._start(self) 327 except Exception as e: # pragma: no cover 328 self.status = Status.failed → 329 await self._close() 330 raise RuntimeError(f"Cluster failed to start: {e}") from e

File ~/.local/lib/python3.10/site-packages/distributed/deploy/spec.py:453, in SpecCluster._close(self) 450 logger.warning(“Cluster closed without starting up”) 452 if self.scheduler: → 453 await self.scheduler.close() 454 for w in self._created: 455 assert w.status in { 456 Status.closing, 457 Status.closed, 458 Status.failed, 459 }, w.status

File ~/.local/lib/python3.10/site-packages/dask_cloudprovider/generic/vmcluster.py:60, in VMInterface.close(self) 58 async def close(self): 59 “”“Destroy a VM.”“” —> 60 await self.destroy_vm() 61 await super().close()

File ~/.local/lib/python3.10/site-packages/dask_cloudprovider/aws/ec2.py:228, in EC2Instance.destroy_vm(self) 223 boto_config = botocore.config.Config(retries=dict(max_attempts=10)) 224 async with self.cluster.boto_session.create_client( 225 “ec2”, region_name=self.region, config=boto_config 226 ) as client: 227 await client.terminate_instances( → 228 InstanceIds=[self.instance[“InstanceId”]], DryRun=False 229 ) 230 self.cluster._log(f"Terminated {self.name} ({self.instance[‘InstanceId’]})")

TypeError: ‘NoneType’ object is not subscriptable

Can anyone tell me what’s wrong? From reading the documentation, I don’t see any required parameters. From the error, it looks like some InstanceId parameter is missing, but there is no such parameter in the dask_cloudprovider documentation.

Hi @digasico,

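I think the most relevant error is this one: “ClientError: An error occurred (InvalidParameterValue) when calling the RunInstances operation: User data is limited to 16384 bytes”. The later TypeError is only a side effect: the scheduler instance was never created, so there is no InstanceId to terminate during cleanup.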

The problem is being tracked here:

But it doesn’t look solved yet. Can you confirm @jacobtomlinson?

This is still an ongoing problem.