Working with AWS Parallel Cluster

AWS Parallel Cluster

Configure the cluster with a config file:

Spot pcluster config

[aws]
aws_region_name = us-east-2
 
[cluster ladcospot]
vpc_settings = ladco
ebs_settings = data,input,apps
scheduler = slurm
master_instance_type = c4.large
compute_instance_type = r4.16xlarge
master_root_volume_size = 40
cluster_type = spot
base_os = alinux2
key_name = ******
s3_read_resource = arn:aws:s3:::ladco-backup/*
s3_read_write_resource = arn:aws:s3:::ladco-wrf/*
post_install = s3://ladco-backup/post_install_users.sh
disable_hyperthreading = true
custom_ami = ami-0c283443a1ebb5c17
max_queue_size = 100
 
# Create a 10 TB cold storage I/O directory
[ebs data]
shared_dir = data
volume_type = sc1
volume_size = 10000
volume_iops = 1500
encrypted = false

# Attach an "apps" volume with pre-loaded software
[ebs apps]
shared_dir = ladco
ebs_volume_id = vol-*****

# Attach an "input" data volume with pre-loaded input data
[ebs input]
shared_dir = input
ebs_volume_id = vol-*****

[vpc ladco]
master_subnet_id = subnet-******
vpc_id = vpc-******

[global]
update_check = true
sanity_check = true
cluster_template = ladcospot

[aliases]
ssh = ssh -Y {CFN_USER}@{MASTER_IP} {ARGS}
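
If settings in this template change after the cluster is up, the same file can be applied to the running cluster with pcluster update (not every parameter can be changed in place). This is a sketch that assumes the template is saved in a file named config.spot, matching the create command used in the Cluster Access section below:

 pcluster update ladcospot -c config.spot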

On-demand pcluster config

[aws]
aws_region_name = us-east-2
 
[cluster ladcowrf]
vpc_settings = public
ebs_settings = ladcowrf
scheduler = sge
master_instance_type = m4.large
compute_instance_type = m5a.4xlarge
placement = cluster
placement_group = DYNAMIC
master_root_volume_size = 40
cluster_type = ondemand
base_os = alinux
key_name = *****
min_vcpus = 0
max_vcpus = 64
desired_vcpus = 0
# Base AMI for pcluster v2.1.0
custom_ami = ami-0381cb7486cdc973f
 
[ebs ladcowrf]
shared_dir = data
volume_type = gp2
volume_size = 10000
volume_iops = 1500
encrypted = false

[vpc public]
master_subnet_id = subnet-******
vpc_id = vpc-******

[global]
update_check = true
sanity_check = true
cluster_template = ladcowrf

[aliases]
ssh = ssh -Y {CFN_USER}@{MASTER_IP} {ARGS}
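
To launch this on-demand template instead of the spot one, point pcluster at whichever file it is saved in; the file name config.ondemand below is only an assumption, and the [global] cluster_template entry makes ladcowrf the default template in that file:

 pcluster create -c config.ondemand ladcowrf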

Cluster Access

Start the cluster

 pcluster create -c config.spot ladcospot
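
Creation takes several minutes while the CloudFormation stack builds. The stack status and the cluster's lifecycle can be checked and managed with the other pcluster subcommands, for example:

 pcluster status ladcospot     # check CloudFormation stack status
 pcluster list                 # list clusters in the configured region
 pcluster delete ladcospot     # delete the cluster when finished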

Log in to the cluster

pcluster ssh ladcospot -i {path to the private key (.pem) file for your AWS key pair}
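
The ladcospot template uses the Slurm scheduler, so once logged in to the master node jobs are managed with the standard Slurm commands; the job script name below is hypothetical:

 sinfo                    # show compute partition and node states
 squeue                   # show queued and running jobs
 sbatch run_wrf.sbatch    # submit a batch job (hypothetical script name)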

Fault Tolerance

Script that polls the EC2 instance metadata for a spot termination notice and then restarts WRF.

#!/bin/bash
CASE=LADCO_2016_WRFv39_YNT_NAM
JSTART=2016095
wrk_dir=/data/apps/WRFV3.9.1/sims/${CASE}
while true
do
  if curl -s http://169.254.169.254/latest/meta-data/spot/termination-time | grep -q 'T.*Z'; then
     # The termination-time endpoint returns a timestamp (e.g. 2021-03-26T17:27:00Z)
     # only after the spot instance is marked for reclamation, about two minutes in advance
     echo "Spot termination notice received"
     break
  else
     echo "Still running fine"
     sleep 3
  fi
done
echo "Restarting WRF job"
#qsub -N WRF_rest $wrk_dir/wrapper_restart_wrf.csh $JSTART 6
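
One way to use this monitor, assumed here rather than documented above, is to save it to a file and start it in the background on the node running WRF, so the restart fires as soon as the two-minute termination notice appears; the file and log names are hypothetical:

 nohup ./monitor_spot_termination.sh > monitor_spot.log 2>&1 &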
