How to use the library?
1. Experiment Setup
(a) Codes
The current code directly supports training CNN7, ResNet20, and VGG16 on the CIFAR-10 dataset, as well as ResNet50 on ImageNet. To train a new model or use a different dataset:
Define a new model:
- define the new model in models.py
- add new lines to the get_model(args) function in local_tools.py, as in the sketch below.
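For illustration, registering a hypothetical MyNet could look like the following sketch; the branch structure and constructor names are assumptions, not the actual contents of local_tools.py.

# local_tools.py (illustrative sketch, not the actual function body)
import models

def get_model(args):
    if args.model == 'resnet20':
        return models.ResNet20()
    elif args.model == 'vgg16':
        return models.VGG16()
    elif args.model == 'mynet':        # new branch for the new model
        return models.MyNet()          # MyNet is defined in models.py
    else:
        raise ValueError('unknown model: %s' % args.model)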
Define a new dataset:
- add new lines to the get_data_loader(args) function in local_tools.py, as in the sketch below.
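Analogously, a sketch of adding a hypothetical MNIST branch to get_data_loader(args); the existing branches and the exact return type are assumptions.

# local_tools.py (illustrative sketch, not the actual function body)
import torch
import torchvision
import torchvision.transforms as transforms

def get_data_loader(args):
    if args.dataset == 'cifar':
        ...  # existing CIFAR-10 branch (elided)
    elif args.dataset == 'mnist':      # new branch for the new dataset
        train_set = torchvision.datasets.MNIST(
            args.datadir, train=True, download=True,
            transform=transforms.ToTensor())
        return torch.utils.data.DataLoader(
            train_set, batch_size=args.batch_size, shuffle=True)
    else:
        raise ValueError('unknown dataset: %s' % args.dataset)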
Note that LSGD/EASGD is a distributed training optimizer, and the underlying local optimizer can be switched to a different one (default: Nesterov-momentum SGD).
Switch the local optimizer:
- replace the default optimizer by modifying self.local_optimizer at lines 23-25 in worker_stoc_optim.py, as in the sketch below.
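For example, swapping the default for Adam could look like this sketch; the constructor arguments shown for the original line are assumptions.

# worker_stoc_optim.py, around lines 23-25 (sketch; inside the worker class)
import torch.optim as optim

# default: Nesterov-momentum SGD
# self.local_optimizer = optim.SGD(self.model.parameters(), lr=args.lr,
#                                  momentum=args.mom, nesterov=True)
# replacement, e.g. Adam:
self.local_optimizer = optim.Adam(self.model.parameters(), lr=args.lr)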
(b) Bash
#!/bin/bash
DATE=`date +%Y-%m-%d`
dataset='cifar'
model='resnet20'
m=0.9
minutes=12
batch_size=128
datadir={'dataset directory'}
num_groups={'number of groups'}
if [[ $num_groups -eq 1 ]]
then
    cur_group=0
    ip_addr={'ip address of current node'}
else
    cur_group={'current group'}
    ip_addr={'ip address of first node'}
fi
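Replace each {'...'} placeholder with a value for your setup. The optimizer-specific snippets in Section 2 assume these variables are already defined, so each snippet is meant to be appended to this preamble to form a complete script.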
2. Supported Optimizers
(a) Distributed Training Optimizers
LSGD
dist_op='LSGD'
check_dir="LSGD-$dataset-$model-w$num_groups-$cur_group-$DATE"
c1=0.1; c2=0.1; p1=0.1; p2=0.1
for g_comm in 4 16 64; do
    for lr in 1e-1 1e-2 1e-3; do
        l_comm=$((g_comm / 4))
        avg_size=$l_comm
        exp_name="$dist_op-lr-$lr-l_comm=$l_comm=g_comm=$g_comm=c1-$c1-c2-$c2-p1-$p1-p2-$p2-m-$m-a-$avg_size="
        echo "STARTING LSGD-lr=$lr(l_comm=$l_comm, g_comm=$g_comm)"
        python ../codes/main.py --distributed --l_comm $l_comm --g_comm $g_comm \
            --dist_optimizer $dist_op --datadir $datadir --dataset $dataset --model $model \
            --batch_size $batch_size --lr $lr --c1 $c1 --c2 $c2 --p1 $p1 --p2 $p2 --mom $m --avg_size $avg_size \
            --dist_ip $ip_addr --dist_port 2432 --num_groups $num_groups --cur_group $cur_group \
            --exp_name $exp_name --checkpoints_dir $check_dir --minutes $minutes
        echo "FINISHING LSGD-lr=$lr(l_comm=$l_comm, g_comm=$g_comm)"
    done
done
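In this sweep the local communication period is tied to the global one (l_comm = g_comm / 4), and the averaging size avg_size is set equal to l_comm.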
EASGD
dist_op='EASGD'
check_dir="EASGD-$dataset-$model-w$num_groups-$cur_group-$DATE"
c1=0.0; c2=0.43; p1=0.0; p2=0.0
for g_comm in 4 16 64; do
    for lr in 1e-1 1e-2 1e-3; do
        l_comm=$g_comm
        exp_name="$dist_op-lr-$lr-comm=$g_comm=c2-$c2-m-$m"
        echo "STARTING $dist_op-lr=$lr(comm=$g_comm)"
        python ../codes/main.py --distributed --l_comm $l_comm --g_comm $g_comm \
            --dist_optimizer $dist_op --datadir $datadir --dataset $dataset --model $model \
            --batch_size $batch_size --lr $lr --c1 $c1 --c2 $c2 --p1 $p1 --p2 $p2 --mom $m \
            --dist_ip $ip_addr --dist_port 2432 --num_groups $num_groups --cur_group $cur_group \
            --exp_name $exp_name --checkpoints_dir $check_dir --minutes $minutes
        echo "FINISHING $dist_op-lr=$lr(comm=$g_comm)"
    done
done
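EASGD is run with a single communication period, so l_comm is simply set equal to g_comm; of the four coefficients c1, c2, p1, p2, only c2 is nonzero here.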
DataParallel
dist_op='DataParallel'
check_dir="DataParallel-$dataset-$model-w$num_groups-$cur_group-$DATE"
g_comm=1
for lr in 1e-1 1e-2 1e-3; do
    exp_name="DataParallel-lr-$lr-m-$m-b-$batch_size-comm=$g_comm="
    echo "STARTING DataParallel-lr=$lr(comm=$g_comm)"
    python ../codes/data_parallel.py --datadir $datadir --dataset $dataset --model $model \
        --optimizer 'SGD' --batch_size $batch_size --lr $lr --mom $m --dist_ip $ip_addr --num_groups $num_groups --cur_group $cur_group \
        --exp_name $exp_name --checkpoints_dir $check_dir --minutes $minutes
    echo "FINISHING DataParallel-lr=$lr(comm=$g_comm)"
done
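DataParallel is launched through data_parallel.py with plain SGD as the underlying optimizer; g_comm is fixed to 1 here only for the experiment name, since data_parallel.py presumably handles gradient synchronization itself.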
LARS
dist_op='LARS'
check_dir="LARS-$dataset-$model-w$num_groups-$cur_group-$DATE"
g_comm=1
for lr in 1e2 1e1 1e0 1e-1 1e-2; do
    exp_name="LARS-lr-$lr-m-$m-b-$batch_size-comm=$g_comm="
    echo "STARTING LARS-lr=$lr(comm=$g_comm)"
    python ../codes/data_parallel.py --datadir $datadir --dataset $dataset --model $model \
        --optimizer 'LARS' --batch_size $batch_size --lr $lr --mom $m --dist_ip $ip_addr --num_groups $num_groups --cur_group $cur_group \
        --exp_name $exp_name --checkpoints_dir $check_dir --minutes $minutes
    echo "FINISHING LARS-lr=$lr(comm=$g_comm)"
done
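Since LARS adapts the learning rate layer-wise, the sweep covers a much wider range of base learning rates (1e2 down to 1e-2) than the SGD-based runs.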
(b) Sequential Optimizers
SGD
check_dir="SGD-$dataset-$model-$DATE"
for lr in 1e-1 1e-2 1e-3; do
    echo "STARTING SGD-lr=$lr"
    python ../codes/sgd.py --datadir $datadir --dataset $dataset --model $model \
        --optimizer 'SGD' --batch_size 4 --lr $lr --mom $m \
        --exp_name "SGD-lr-$lr-m-$m-b-4" --checkpoints_dir $check_dir --minutes $minutes \
        --gpu 0 &
    python ../codes/sgd.py --datadir $datadir --dataset $dataset --model $model \
        --optimizer 'SGD' --batch_size 16 --lr $lr --mom $m \
        --exp_name "SGD-lr-$lr-m-$m-b-16" --checkpoints_dir $check_dir --minutes $minutes \
        --gpu 1 &
    python ../codes/sgd.py --datadir $datadir --dataset $dataset --model $model \
        --optimizer 'SGD' --batch_size 64 --lr $lr --mom $m \
        --exp_name "SGD-lr-$lr-m-$m-b-64" --checkpoints_dir $check_dir --minutes $minutes \
        --gpu 2 &
    python ../codes/sgd.py --datadir $datadir --dataset $dataset --model $model \
        --optimizer 'SGD' --batch_size 128 --lr $lr --mom $m \
        --exp_name "SGD-lr-$lr-m-$m-b-128" --checkpoints_dir $check_dir --minutes $minutes \
        --gpu 3 &
    wait
    echo "FINISHING SGD-lr=$lr"
done
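Each learning rate is tried with four batch sizes (4, 16, 64, 128) in parallel, one run per GPU; the trailing & puts each run in the background, and wait blocks until all four finish before the next learning rate starts.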