Skip to content

How to use the library?

1. Experiment Setup

(a) Codes

The current codes directly support training CNN7, ResNet20 and VGG16 on the CIFAR-10 dataset as well as ResNet50 on ImageNet. To train a new model on a different dataset:

Define a new model:

  • define the new model in models.py
  • add new lines into the get_model(args) function in local_tools.py.

Define a new dataset:

  • add new lines into get_data_loader(args) function in local_tools.py.

Notice that LSGD/EASGD is a distributed training optimizer, and the underlying local optimizer can be switched to a different one (default: Nesterov-momentum SGD).

Switch the local optimizer:

  • Replace the default optimizer by modifying self.local_optimizer at lines 23 ~ 25 in worker_stoc_optim.py

(b) Bash

#!/bin/bash
# Common experiment configuration shared by every optimizer section below.
DATE=$(date +%Y-%m-%d)          # run date; tags checkpoint directories ($() preferred over backticks)
dataset='cifar'
model='resnet20'
m=0.9                           # momentum passed as --mom to every run
minutes=12                      # wall-clock budget per run (--minutes)
batch_size=128
datadir={'dataset directory'}   # TODO: replace placeholder with the real dataset path
num_groups={'number of groups'} # TODO: replace placeholder with the number of node groups

# Single-group runs act as group 0 and use their own address; multi-group
# runs must all point at the first node's address for rendezvous.
if [[ $num_groups -eq 1 ]]; then
  cur_group=0
  ip_addr={'ip address of current node'}    # TODO: replace placeholder
else
  cur_group={'current group'}               # TODO: replace placeholder
  ip_addr={'ip address of first node'}      # TODO: replace placeholder
fi

2. Supported Optimizers

(a) Distributed Training Optimizers

LSGD

# --- LSGD: sweep global communication period and learning rate ---
dist_op='LSGD'
check_dir="LSGD-$dataset-$model-w$num_groups-$cur_group-$DATE"
# c1/c2/p1/p2 are LSGD coefficients forwarded to main.py -- see codes/main.py
# for their exact semantics (presumably pull/push strengths; confirm there).
c1=0.1; c2=0.1; p1=0.1; p2=0.1
for g_comm in 4 16 64; do
  for lr in 1e-1 1e-2 1e-3; do
    # Local averaging runs 4x as often as global averaging.
    l_comm=$(( g_comm / 4 ))
    avg_size=$l_comm
    exp_name="$dist_op-lr-$lr-l_comm=$l_comm=g_comm=$g_comm=c1-$c1-c2-$c2-p1-$p1-p2-$p2-m-$m-a-$avg_size="

    echo "STARTING LSGD-lr=$lr(l_comm=$l_comm, g_comm=$g_comm)"
    python ../codes/main.py --distributed --l_comm "$l_comm" --g_comm "$g_comm" \
      --dist_optimizer "$dist_op" --datadir "$datadir" --dataset "$dataset" --model "$model" \
      --batch_size "$batch_size" --lr "$lr" --c1 "$c1" --c2 "$c2" --p1 "$p1" --p2 "$p2" --mom "$m" --avg_size "$avg_size" \
      --dist_ip "$ip_addr" --dist_port 2432 --num_groups "$num_groups" --cur_group "$cur_group" \
      --exp_name "$exp_name" --checkpoints_dir "$check_dir" --minutes "$minutes"
    echo "FINISHING LSGD-lr=$lr(l_comm=$l_comm, g_comm=$g_comm)"
  done
done

EASGD

# --- EASGD: sweep communication period and learning rate ---
dist_op='EASGD'
check_dir="EASGD-$dataset-$model-w$num_groups-$cur_group-$DATE"
# Only c2 is non-zero for EASGD (presumably the elastic averaging
# coefficient -- confirm against codes/main.py).
c1=0.0; c2=0.43; p1=0.0; p2=0.0
for g_comm in 4 16 64; do
  for lr in 1e-1 1e-2 1e-3; do
    # EASGD uses a single period: local communication == global communication.
    l_comm=$g_comm
    exp_name="$dist_op-lr-$lr-comm=$g_comm=c2-$c2-m-$m"
    echo "STARTING $dist_op-lr=$lr(comm=$g_comm)"
    python ../codes/main.py --distributed --l_comm "$l_comm" --g_comm "$g_comm" \
      --dist_optimizer "$dist_op" --datadir "$datadir" --dataset "$dataset" --model "$model" \
      --batch_size "$batch_size" --lr "$lr" --c1 "$c1" --c2 "$c2" --p1 "$p1" --p2 "$p2" --mom "$m" \
      --dist_ip "$ip_addr" --dist_port 2432 --num_groups "$num_groups" --cur_group "$cur_group" \
      --exp_name "$exp_name" --checkpoints_dir "$check_dir" --minutes "$minutes"
    echo "FINISHING $dist_op-lr=$lr(comm=$g_comm)"
  done
done

DataParallel

# --- DataParallel baseline: synchronize every step (comm period 1) ---
dist_op='DataParallel'
check_dir="DataParallel-$dataset-$model-w$num_groups-$cur_group-$DATE"
g_comm=1
for lr in 1e-1 1e-2 1e-3; do
  exp_name="DataParallel-lr-$lr-m-$m-b-$batch_size-comm=$g_comm="
  echo "STARTING DataParallel-lr=$lr(comm=$g_comm)"
  python ../codes/data_parallel.py --datadir "$datadir" --dataset "$dataset" --model "$model" \
    --optimizer 'SGD' --batch_size "$batch_size" --lr "$lr" --mom "$m" \
    --dist_ip "$ip_addr" --num_groups "$num_groups" --cur_group "$cur_group" \
    --exp_name "$exp_name" --checkpoints_dir "$check_dir" --minutes "$minutes"
  echo "FINISHING DataParallel-lr=$lr(comm=$g_comm)"
done

LARS

# --- LARS baseline: wider learning-rate sweep (LARS tolerates large lr) ---
dist_op='LARS'
check_dir="LARS-$dataset-$model-w$num_groups-$cur_group-$DATE"
g_comm=1
for lr in 1e2 1e1 1e0 1e-1 1e-2; do
  exp_name="LARS-lr-$lr-m-$m-b-$batch_size-comm=$g_comm="
  echo "STARTING LARS-lr=$lr(comm=$g_comm)"
  python ../codes/data_parallel.py --datadir "$datadir" --dataset "$dataset" --model "$model" \
    --optimizer 'LARS' --batch_size "$batch_size" --lr "$lr" --mom "$m" \
    --dist_ip "$ip_addr" --num_groups "$num_groups" --cur_group "$cur_group" \
    --exp_name "$exp_name" --checkpoints_dir "$check_dir" --minutes "$minutes"
  echo "FINISHING LARS-lr=$lr(comm=$g_comm)"
done

(b) Sequential Optimizers

SGD

# --- Sequential SGD baseline: one run per (lr, batch size) pair ---
# The four batch sizes run concurrently, each pinned to its own GPU (0-3);
# `wait` forms a barrier before the next learning rate starts.
# NOTE(review): $g_comm in the echo messages is leftover state from the
# distributed sections above (1 when the full script runs top to bottom).
check_dir="SGD-$dataset-$model-$DATE"
for lr in 1e-1 1e-2 1e-3; do
  echo "STARTING SGD-lr=$lr(comm=$g_comm)"
  gpu=0
  for bs in 4 16 64 128; do
    python ../codes/sgd.py --datadir "$datadir" --dataset "$dataset" --model "$model" \
      --optimizer 'SGD' --batch_size "$bs" --lr "$lr" --mom "$m" \
      --exp_name "SGD-lr-$lr-m-$m-b-$bs" --checkpoints_dir "$check_dir" --minutes "$minutes" \
      --gpu "$gpu" &
    gpu=$((gpu + 1))
  done
  wait
  echo "FINISHING SGD-lr=$lr(comm=$g_comm)"
done