Difference between revisions of "Working with AWS Parallel Cluster"

From LADCO Wiki
Jump to: navigation, search
(spot pcluster config)
(AWS Parallel Cluster)
Line 6: Line 6:
 
Configure the cluster with a config file:
 
Configure the cluster with a config file:
  
== spot pcluster config ==
+
== [[spot pcluster config]] ==
  
 
  [aws]
 
  [aws]

Revision as of 17:28, 26 March 2021

AWS Parallel Cluster

Configure the cluster with a config file:

spot pcluster config

# Spot-fleet AWS ParallelCluster (v2) configuration for LADCO modeling runs.
[aws]
aws_region_name = us-east-2

[cluster ladcospot]
# Must name a defined [vpc <name>] section; this file defines [vpc public].
vpc_settings = public
ebs_settings = data,input,apps
scheduler = slurm
master_instance_type = c4.large
compute_instance_type = r4.16xlarge
# GiB
master_root_volume_size = 40
cluster_type = spot
base_os = alinux2
key_name = ******
s3_read_resource = arn:aws:s3:::ladco-backup/*
s3_read_write_resource = arn:aws:s3:::ladco-wrf/*
post_install = s3://ladco-backup/post_install_users.sh
disable_hyperthreading = true
custom_ami = ami-0c283443a1ebb5c17
max_queue_size = 100

# Create a 10 TB cold-storage (sc1) I/O volume
[ebs data]
shared_dir = data
volume_type = sc1
# GiB
volume_size = 10000
# NOTE: volume_iops is only valid for io1 volumes; sc1 throughput is
# size-derived, so the parameter is disabled here.
# volume_iops = 1500
encrypted = false

# Attach an "apps" volume with pre-loaded software
[ebs apps]
shared_dir = ladco
ebs_volume_id = vol-*****

# Attach an "input" data volume with pre-loaded input data
[ebs input]
shared_dir = input
ebs_volume_id = vol-*****

[vpc public]
master_subnet_id = subnet-******
vpc_id = vpc-******

[global]
update_check = true
sanity_check = true
cluster_template = ladcospot

[aliases]
ssh = ssh -Y {CFN_USER}@{MASTER_IP} {ARGS}

on demand pcluster config

# On-demand AWS ParallelCluster (v2) configuration for LADCO WRF runs.
[aws]
aws_region_name = us-east-2

[cluster ladcowrf]
vpc_settings = public
ebs_settings = ladcowrf
scheduler = sge
master_instance_type = m4.large
compute_instance_type = m5a.4xlarge
placement = cluster
placement_group = DYNAMIC
# GiB
master_root_volume_size = 40
cluster_type = ondemand
base_os = alinux
key_name = *****
# NOTE(review): in ParallelCluster v2 the *_vcpus settings apply only to the
# awsbatch scheduler; with sge the queue is sized via initial_queue_size /
# max_queue_size — confirm these are not silently ignored.
min_vcpus = 0
max_vcpus = 64
desired_vcpus = 0
# Base AMI for pcluster v2.1.0
custom_ami = ami-0381cb7486cdc973f

[ebs ladcowrf]
shared_dir = data
volume_type = gp2
# GiB
volume_size = 10000
# NOTE: volume_iops is only valid for io1 volumes; gp2 IOPS are derived
# from volume size, so the parameter is disabled here.
# volume_iops = 1500
encrypted = false

[vpc public]
master_subnet_id = subnet-******
vpc_id = vpc-******

[global]
update_check = true
sanity_check = true
cluster_template = ladcowrf

[aliases]
ssh = ssh -Y {CFN_USER}@{MASTER_IP} {ARGS}

Cluster Access

Start the cluster

 pcluster create -c config.spot ladcospot

Log in to the cluster

pcluster ssh ladcospot -i {path to your AWS private key file (.pem)}

Fault Tolerance

Script that polls the EC2 instance metadata endpoint for a spot termination notice and, once one is posted, restarts the WRF job.

#!/bin/bash
# Poll the EC2 instance metadata service for a spot termination notice and,
# once one is posted, resubmit the WRF restart job.
CASE=LADCO_2016_WRFv39_YNT_NAM
JSTART=2016095
wrk_dir=/data/apps/WRFV3.9.1/sims/${CASE}
while true
do
  # The spot/termination-time endpoint returns a timestamp such as
  # 2021-03-26T17:28:00Z only after a termination notice is issued.
  # BUGFIX: the original piped curl into `grep -q` inside $(...) and tested
  # the substitution with [ -z ... ]; grep -q emits no output, so the test
  # was always true and the loop reported "terminated" immediately.
  # Test grep's exit status directly instead.
  if curl -s http://169.254.169.254/latest/meta-data/spot/termination-time | grep -q 'T.*Z'; then
     echo "terminated"
     break
  else
     echo "Still running fine"
     sleep 3
  fi
done
echo "Restarting WRF job"
#qsub -N WRF_rest $wrk_dir/wrapper_restart_wrf.csh $JSTART 6

~