From 939e303f1d0b2ed49484fcf4e505a913e333b9c2 Mon Sep 17 00:00:00 2001
From: Hang Zhang <zhang.hang@rutgers.edu>
Date: Fri, 24 Apr 2020 19:02:29 -0700
Subject: [PATCH] [WIP] ImageNet training with mxnet gluon (#33)

---
 README.md                    | 22 ++++++++++++++++--
 ablation.md                  |  4 ++--
 resnest/gluon/model_store.py |  4 ++--
 resnest/torch/ablation.py    |  4 ++--
 resnest/transforms.py        |  2 +-
 scripts/gluon/README.md      | 44 ++++++++++++++++++++++++++++++++++++
 scripts/gluon/verify.py      |  2 +-
 7 files changed, 72 insertions(+), 10 deletions(-)
 create mode 100644 scripts/gluon/README.md
diff --git a/README.md b/README.md
index 76dfff5..eecab83 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,8 @@
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
 [![Unit Test](https://github.com/zhanghang1989/ResNeSt/workflows/Unit%20Test/badge.svg)](https://github.com/zhanghang1989/ResNeSt/actions)
 
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/resnest-split-attention-networks/semantic-segmentation-on-ade20k)](https://paperswithcode.com/sota/semantic-segmentation-on-ade20k?p=resnest-split-attention-networks)
+
 # ResNeSt
 Split-Attention Network, A New ResNet Variant. It significantly boosts the performance of downstream models such as Mask R-CNN, Cascade R-CNN and DeepLabV3.
 
@@ -198,7 +200,7 @@ Training code and pretrained models are released at our [Detectron2 Fork](https:
     <th class="tg-cly1">mIoU%</th>
   </tr>
   <tr>
-    <td rowspan="4" class="tg-cly1">Deeplab-V3<br></td>
+    <td rowspan="5" class="tg-cly1">Deeplab-V3<br></td>
     <td class="tg-cly1">ResNet-50</td>
     <td class="tg-cly1">80.39</td>
     <td class="tg-cly1">42.1</td>
@@ -218,6 +220,11 @@ Training code and pretrained models are released at our [Detectron2 Fork](https:
     <td class="tg-0lax"><b>82.07</td>
     <td class="tg-0lax"><b>46.91</b></td>
   </tr>
+  <tr>
+    <td class="tg-0lax">ResNeSt-269 (<span style="color:red">ours</span>)</td>
+    <td class="tg-0lax"><b>82.62</b></td>
+    <td class="tg-0lax"><b>47.60</b></td>
+  </tr>
 </table>
 
 
@@ -253,9 +260,20 @@ python verify.py --model resnest50 --crop-size 224
 
 ## How to Train
 
-- Training with Gluon: Please visit [GluonCV Toolkit](https://gluon-cv.mxnet.io/model_zoo/classification.html#resnest).
+### ImageNet Models
+
+- Training with MXNet Gluon: Please visit [Gluon folder](./scripts/gluon/).
 - Training with PyTorch: Please visit [PyTorch Encoding Toolkit](https://hangzhang.org/PyTorch-Encoding/model_zoo/imagenet.html) (slightly worse than Gluon implementation).
 
+### Detectron Models
+
+For object detection and instance segmentation models, please visit our [detectron2-ResNeSt fork](https://github.com/zhanghang1989/detectron2-ResNeSt).
+
+### Semantic Segmentation
+ 
+- Training with PyTorch: [Encoding Toolkit](https://hangzhang.org/PyTorch-Encoding/model_zoo/segmentation.html).
+- Training with MXNet: [GluonCV Toolkit](https://gluon-cv.mxnet.io/model_zoo/segmentation.html#ade20k-dataset).
+
 ## Reference
 
 **ResNeSt: Split-Attention Networks** [[arXiv](https://arxiv.org/pdf/2004.08955.pdf)]
diff --git a/ablation.md b/ablation.md
index b0db836..125db4f 100644
--- a/ablation.md
+++ b/ablation.md
@@ -6,8 +6,8 @@
 | ResNeSt-50-fast | 2s1x64d | 27.5M | 4.34   | 80.53   | 80.65 |
 | ResNeSt-50-fast | 4s1x64d | 31.9M | 4.35   | 80.76   | 80.90 |
 | ResNeSt-50-fast | 1s2x40d | 25.9M | 4.38   | 80.59   | 80.72 |
-| ResNeSt-50-fast | 2s2x40d | 26.9M | 4.38   | xx.xx   | 80.85 |
-| ResNeSt-50-fast | 4s2x40d | 30.4M | 4.41   | 80.86   | 80.92 |
+| ResNeSt-50-fast | 2s2x40d | 26.9M | 4.38   | 80.61   | 80.84 |
+| ResNeSt-50-fast | 4s2x40d | 30.4M | 4.41   | 81.14   | 81.17 |
 | ResNeSt-50-fast | 1s4x24d | 25.7M | 4.42   | 80.99   | 80.97 |
 
 ### PyTorch Models
diff --git a/resnest/gluon/model_store.py b/resnest/gluon/model_store.py
index 0623735..53a9529 100644
--- a/resnest/gluon/model_store.py
+++ b/resnest/gluon/model_store.py
@@ -24,8 +24,8 @@
     ('85eb779a5e313d74b5e5390dae02aa8082a0f469', 'resnest50_fast_2s1x64d'),
     ('3f215532c6d8e07a10df116309993d4479fc3e4b', 'resnest50_fast_4s1x64d'),
     ('af3514c2ec757a3a9666a75b82f142ed47d55bee', 'resnest50_fast_1s2x40d'),
-    ('d4a7f303531a333d8ad5cf6f73cab84d0a2dd752', 'resnest50_fast_2s2x40d'),
-    ('1a9f15bcd4ffddd793acdea05de01f73b096e614', 'resnest50_fast_4s2x40d'),
+    ('2db13245aa4967cf5e8617cb4911880dd41628a4', 'resnest50_fast_2s2x40d'),
+    ('b24d515797832e02da4da9c8a15effd5e44cfb56', 'resnest50_fast_4s2x40d'),
     ('7318153ddb5e542a20cc6c58192f3c6209cff9ed', 'resnest50_fast_1s4x24d'),
     ]}
 
diff --git a/resnest/torch/ablation.py b/resnest/torch/ablation.py
index d3756a7..2b89e7e 100644
--- a/resnest/torch/ablation.py
+++ b/resnest/torch/ablation.py
@@ -21,8 +21,8 @@
     ('44938639', 'resnest50_fast_2s1x64d'),
     ('f74f3fc3', 'resnest50_fast_4s1x64d'),
     ('32830b84', 'resnest50_fast_1s2x40d'),
-    ('0e48a197', 'resnest50_fast_2s2x40d'),
-    ('59057aca', 'resnest50_fast_4s2x40d'),
+    ('9d126481', 'resnest50_fast_2s2x40d'),
+    ('41d14ed0', 'resnest50_fast_4s2x40d'),
     ('d4a4f76f', 'resnest50_fast_1s4x24d'),
     ]}
 
diff --git a/resnest/transforms.py b/resnest/transforms.py
index 579f5ce..c1c7125 100644
--- a/resnest/transforms.py
+++ b/resnest/transforms.py
@@ -374,7 +374,7 @@ def __call__(self, img):
             if area < self.min_covered * (original_width * original_height):
                 continue
             if width == original_width and height == original_height:
-                return self._fallback(img)      # https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/preprocessing.py#L102
+                return self._fallback(img)
 
             x = random.randint(0, original_width - width)
             y = random.randint(0, original_height - height)
diff --git a/scripts/gluon/README.md b/scripts/gluon/README.md
new file mode 100644
index 0000000..3362011
--- /dev/null
+++ b/scripts/gluon/README.md
@@ -0,0 +1,44 @@
+## Train ResNeSt with MXNet Gluon
+
+For training with PyTorch, please visit the [PyTorch Encoding Toolkit](https://hangzhang.org/PyTorch-Encoding/model_zoo/imagenet.html).
+
+### Install MXNet with Horovod
+
+```bash
+# assuming you have CUDA 10.0 on your machine
+pip install mxnet-cu100
+HOROVOD_GPU_ALLREDUCE=NCCL pip install -v --no-cache-dir horovod
+pip install --no-cache mpi4py
+```
+
+### Prepare ImageNet recordio data format
+
+- Unfortunately, this is required for training using MXNet Gluon. Please follow the [GluonCV tutorial](https://gluon-cv.mxnet.io/build/examples_datasets/recordio.html) to prepare the data.
+- Copy the data into ramdisk (optional):
+	
+	```
+	cd ~/
+	sudo mkdir -p /media/ramdisk
+	sudo mount -t tmpfs -o size=200G tmpfs /media/ramdisk
+	cp -r /home/ubuntu/data/ILSVRC2012/ /media/ramdisk
+	```
+
+### Training command
+
+Using ResNeSt-50 as the target model:
+
+```bash
+horovodrun -np 64 --hostfile hosts python train.py \
+--rec-train /media/ramdisk/ILSVRC2012/train.rec \
+--rec-val /media/ramdisk/ILSVRC2012/val.rec \
+--model resnest50 --lr 0.05 --num-epochs 270 --batch-size 128 \
+--use-rec --dtype float32 --warmup-epochs 5 --last-gamma --no-wd \
+--label-smoothing --mixup --save-dir params_resnest50 \
+--log-interval 50 --eval-frequency 5 --auto_aug --input-size 224
+```
+
+### Verify pretrained model
+
+```bash
+python verify.py --model resnest50 --crop-size 224 --resume params_resnest50/imagenet-resnest50-269.params
+```
\ No newline at end of file
diff --git a/scripts/gluon/verify.py b/scripts/gluon/verify.py
index ab2714c..5e7f7e2 100644
--- a/scripts/gluon/verify.py
+++ b/scripts/gluon/verify.py
@@ -8,7 +8,6 @@
 from mxnet.gluon.data.vision import transforms
 from mxnet.contrib.quantization import *
 
-from gluoncv.data import imagenet
 from resnest.gluon import get_model
 
 from PIL import Image
@@ -136,6 +135,7 @@ def test(network, ctx, val_data, batch_fn):
         ])
 
     if not opt.rec_dir:
+        from gluoncv.data import imagenet
         val_data = gluon.data.DataLoader(
             imagenet.classification.ImageNet(opt.data_dir, train=False).transform_first(transform_test),
             batch_size=batch_size, shuffle=False, num_workers=num_workers)