diff --git a/README.md b/README.md
index 19f0cc07..ac5add90 100755
--- a/README.md
+++ b/README.md
@@ -64,6 +64,9 @@ python main.py --mode=evaluate
 * [1] [Photo-Realistic Single Image Super-Resolution Using a Generative Adversarial Network](https://arxiv.org/abs/1609.04802)
 * [2] [Is the deconvolution layer the same as a convolutional layer ?](https://arxiv.org/abs/1609.07009)
 
+### Author
+- [zsdonghao](https://github.com/zsdonghao)
+
 ### License
 
 - For academic and non-commercial use only.
diff --git a/main.py b/main.py
index d9828842..0d1ff6f5 100755
--- a/main.py
+++ b/main.py
@@ -278,8 +278,8 @@ def evaluate():
     # print(valid_lr_img.min(), valid_lr_img.max())
     size = valid_lr_img.shape
-    t_image = tf.placeholder('float32', [None, size[0], size[1], size[2]], name='input_image')
-    # t_image = tf.placeholder('float32', [1, None, None, 3], name='input_image')
+    # t_image = tf.placeholder('float32', [None, size[0], size[1], size[2]], name='input_image') # the old version of TL needs to specify the image size
+    t_image = tf.placeholder('float32', [1, None, None, 3], name='input_image')
 
     net_g = SRGAN_g(t_image, is_train=False, reuse=False)
diff --git a/tensorlayer/__init__.py b/tensorlayer/__init__.py
index bdb2b987..0a45da2e 100644
--- a/tensorlayer/__init__.py
+++ b/tensorlayer/__init__.py
@@ -21,12 +21,13 @@ from . import prepro
 from . import nlp
 from . import rein
+from . import distributed
 
 # alias
 act = activation
 vis = visualize
 
-__version__ = "1.5.0"
+__version__ = "1.7.3"
 
 global_flag = {}
 global_dict = {}
diff --git a/tensorlayer/activation.py b/tensorlayer/activation.py
index 7b6b6402..a5e38360 100644
--- a/tensorlayer/activation.py
+++ b/tensorlayer/activation.py
@@ -1,7 +1,5 @@
 #! /usr/bin/python
-# -*- coding: utf8 -*-
-
-
+# -*- coding: utf-8 -*-
 
 import tensorflow as tf
 
@@ -13,7 +11,6 @@ def identity(x, name=None):
     x : a tensor input
         input(s)
 
-
     Returns
     --------
     A `Tensor` with the same type as `x`.
@@ -37,14 +34,13 @@ def ramp(x=None, v_min=0, v_max=1, name=None):
     name : a string or None
         An optional name to attach to this activation function.
 
-
     Returns
     --------
     A `Tensor` with the same type as `x`.
     """
     return tf.clip_by_value(x, clip_value_min=v_min, clip_value_max=v_max, name=name)
 
-def leaky_relu(x=None, alpha=0.1, name="LeakyReLU"):
+def leaky_relu(x=None, alpha=0.1, name="lrelu"):
     """The LeakyReLU, Shortcut is ``lrelu``.
 
     Modified version of ReLU, introducing a nonzero gradient for negative
@@ -67,16 +63,33 @@
     ------------
     - `Rectifier Nonlinearities Improve Neural Network Acoustic Models, Maas et al. (2013) `_
     """
-    with tf.name_scope(name) as scope:
+    # with tf.name_scope(name) as scope:
     # x = tf.nn.relu(x)
     # m_x = tf.nn.relu(-x)
     # x -= alpha * m_x
-        x = tf.maximum(x, alpha * x)
+    x = tf.maximum(x, alpha * x, name=name)
     return x
 
 #Shortcut
 lrelu = leaky_relu
 
+
+def swish(x, name='swish'):
+    """The Swish function, see `Swish: a Self-Gated Activation Function `_.
+
+    Parameters
+    ----------
+    x : a tensor input
+        input(s)
+
+    Returns
+    --------
+    A `Tensor` with the same type as `x`.
+    """
+    with tf.name_scope(name) as scope:
+        x = tf.nn.sigmoid(x) * x
+    return x
+
+
 def pixel_wise_softmax(output, name='pixel_wise_softmax'):
     """Return the softmax outputs of images, every pixels have multiple label, the sum of a pixel is 1.
     Usually be used for image segmentation.
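A minimal usage sketch for the two activation changes above: `leaky_relu` now names the op itself (default name `lrelu`), and `swish` is new. This assumes the TensorLayer 1.x layer API; the layer names and sizes are illustrative:

```python
import tensorflow as tf
import tensorlayer as tl

x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
net = tl.layers.InputLayer(x, name='input')
# `tl.act.lrelu` is the shortcut for `leaky_relu`; the lambda pins the slope alpha.
net = tl.layers.DenseLayer(net, n_units=800,
                           act=lambda v: tl.act.lrelu(v, alpha=0.2), name='dense1')
# The new self-gated swish activation drops in the same way.
net = tl.layers.DenseLayer(net, n_units=800, act=tl.act.swish, name='dense2')
```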
diff --git a/tensorlayer/cost.py b/tensorlayer/cost.py
index 65554b13..941b5c5b 100644
--- a/tensorlayer/cost.py
+++ b/tensorlayer/cost.py
@@ -1,8 +1,7 @@
 #! /usr/bin/python
-# -*- coding: utf8 -*-
-
-
+# -*- coding: utf-8 -*-
+import logging
 import tensorflow as tf
 import numbers
 from tensorflow.python.framework import ops
@@ -32,19 +31,19 @@ def cross_entropy(output, target, name=None):
     - About cross-entropy: `wiki `_.\n
     - The code is borrowed from: `here `_.
     """
-    try: # old
-        return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=output, targets=target))
-    except: # TF 1.0
-        assert name is not None, "Please give a unique name to tl.cost.cross_entropy for TF1.0+"
-        return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output, name=name))
+    # try: # old
+    #     return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=output, targets=target))
+    # except: # TF 1.0
+    assert name is not None, "Please give a unique name to tl.cost.cross_entropy for TF1.0+"
+    return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output, name=name))
 
 
 def sigmoid_cross_entropy(output, target, name=None):
     """It is a sigmoid cross-entropy operation, see ``tf.nn.sigmoid_cross_entropy_with_logits``.
     """
-    try: # TF 1.0
-        return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output, name=name))
-    except:
-        return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=output, targets=target))
+    # try: # TF 1.0
+    return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output, name=name))
+    # except:
+    #     return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=output, targets=target))
 
 
 def binary_cross_entropy(output, target, epsilon=1e-8, name='bce_loss'):
@@ -76,55 +75,94 @@
                               (1. - target) * tf.log(1. - output + epsilon)), axis=1))
 
 
-def mean_squared_error(output, target, is_mean=False):
-    """Return the TensorFlow expression of mean-squre-error of two distributions.
+def mean_squared_error(output, target, is_mean=False, name="mean_squared_error"):
+    """ Return the TensorFlow expression of mean-square-error (L2) of two batches of data.
 
     Parameters
     ----------
-    output : 2D or 4D tensor.
-    target : 2D or 4D tensor.
+    output : 2D, 3D or 4D tensor i.e. [batch_size, n_feature], [batch_size, w, h] or [batch_size, w, h, c].
+    target : 2D, 3D or 4D tensor.
     is_mean : boolean, if True, use ``tf.reduce_mean`` to compute the loss of one data, otherwise, use ``tf.reduce_sum`` (default).
 
     References
     ------------
     - `Wiki Mean Squared Error `_
     """
-    with tf.name_scope("mean_squared_error_loss"):
+    with tf.name_scope(name):
         if output.get_shape().ndims == 2:   # [batch_size, n_feature]
             if is_mean:
                 mse = tf.reduce_mean(tf.reduce_mean(tf.squared_difference(output, target), 1))
             else:
                 mse = tf.reduce_mean(tf.reduce_sum(tf.squared_difference(output, target), 1))
+        elif output.get_shape().ndims == 3: # [batch_size, w, h]
+            if is_mean:
+                mse = tf.reduce_mean(tf.reduce_mean(tf.squared_difference(output, target), [1, 2]))
+            else:
+                mse = tf.reduce_mean(tf.reduce_sum(tf.squared_difference(output, target), [1, 2]))
         elif output.get_shape().ndims == 4: # [batch_size, w, h, c]
             if is_mean:
                 mse = tf.reduce_mean(tf.reduce_mean(tf.squared_difference(output, target), [1, 2, 3]))
             else:
                 mse = tf.reduce_mean(tf.reduce_sum(tf.squared_difference(output, target), [1, 2, 3]))
+        else:
+            raise Exception("Unknown dimension")
        return mse
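A quick sketch of the extended `mean_squared_error` above; the placeholder shapes are hypothetical, the new 3D branch and `name` argument come from the hunk:

```python
import tensorflow as tf
import tensorlayer as tl

target = tf.placeholder(tf.float32, [None, 28, 28, 1])
output = tf.placeholder(tf.float32, [None, 28, 28, 1])
# Default: sum of squared errors per image, then mean over the batch.
mse_sum = tl.cost.mean_squared_error(output, target, name='mse_sum')
# With is_mean=True: mean squared error per image, then mean over the batch.
mse_mean = tl.cost.mean_squared_error(output, target, is_mean=True, name='mse_mean')
```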
+ """Return the TensorFlow expression of normalized mean-square-error of two distributions. Parameters ---------- - output : 2D or 4D tensor. - target : 2D or 4D tensor. + output : 2D, 3D or 4D tensor i.e. [batch_size, n_feature], [batch_size, w, h] or [batch_size, w, h, c]. + target : 2D, 3D or 4D tensor. """ with tf.name_scope("mean_squared_error_loss"): if output.get_shape().ndims == 2: # [batch_size, n_feature] nmse_a = tf.sqrt(tf.reduce_sum(tf.squared_difference(output, target), axis=1)) nmse_b = tf.sqrt(tf.reduce_sum(tf.square(target), axis=1)) + elif output.get_shape().ndims == 3: # [batch_size, w, h] + nmse_a = tf.sqrt(tf.reduce_sum(tf.squared_difference(output, target), axis=[1,2])) + nmse_b = tf.sqrt(tf.reduce_sum(tf.square(target), axis=[1,2])) elif output.get_shape().ndims == 4: # [batch_size, w, h, c] nmse_a = tf.sqrt(tf.reduce_sum(tf.squared_difference(output, target), axis=[1,2,3])) nmse_b = tf.sqrt(tf.reduce_sum(tf.square(target), axis=[1,2,3])) nmse = tf.reduce_mean(nmse_a / nmse_b) return nmse +def absolute_difference_error(output, target, is_mean=False): + """ Return the TensorFlow expression of absolute difference error (L1) of two batch of data. -def dice_coe(output, target, epsilon=1e-10): - """Sørensen–Dice coefficient for comparing the similarity of two distributions, - usually be used for binary image segmentation i.e. labels are binary. - The coefficient = [0, 1], 1 if totally match. + Parameters + ---------- + output : 2D, 3D or 4D tensor i.e. [batch_size, n_feature], [batch_size, w, h] or [batch_size, w, h, c]. + target : 2D, 3D or 4D tensor. + is_mean : boolean, if True, use ``tf.reduce_mean`` to compute the loss of one data, otherwise, use ``tf.reduce_sum`` (default). + """ + with tf.name_scope("mean_squared_error_loss"): + if output.get_shape().ndims == 2: # [batch_size, n_feature] + if is_mean: + loss = tf.reduce_mean(tf.reduce_mean(tf.abs(output - target), 1)) + else: + loss = tf.reduce_mean(tf.reduce_sum(tf.abs(output - target), 1)) + elif output.get_shape().ndims == 3: # [batch_size, w, h] + if is_mean: + loss = tf.reduce_mean(tf.reduce_mean(tf.abs(output - target), [1, 2])) + else: + loss = tf.reduce_mean(tf.reduce_sum(tf.abs(output - target), [1, 2])) + elif output.get_shape().ndims == 4: # [batch_size, w, h, c] + if is_mean: + loss = tf.reduce_mean(tf.reduce_mean(tf.abs(output - target), [1, 2, 3])) + else: + loss = tf.reduce_mean(tf.reduce_sum(tf.abs(output - target), [1, 2, 3])) + else: + raise Exception("Unknow dimension") + return loss + + +def dice_coe(output, target, loss_type='jaccard', axis=[1,2,3], smooth=1e-5): + """Soft dice (Sørensen or Jaccard) coefficient for comparing the similarity + of two batch of data, usually be used for binary image segmentation + i.e. labels are binary. The coefficient between 0 to 1, 1 means totally match. Parameters ----------- @@ -132,35 +170,50 @@ def dice_coe(output, target, epsilon=1e-10): A distribution with shape: [batch_size, ....], (any dimensions). target : tensor A distribution with shape: [batch_size, ....], (any dimensions). - epsilon : float - An optional name to attach to this layer. + loss_type : string + ``jaccard`` or ``sorensen``, default is ``jaccard``. + axis : list of integer + All dimensions are reduced, default ``[1,2,3]``. + smooth : float + This small value will be added to the numerator and denominator. + If both output and target are empty, it makes sure dice is 1. 
+        If either output or target are empty (all pixels are background), ``dice = smooth/(small_value + smooth)``,
+        so if smooth is very small, the dice will be close to 0 (even when the image values are below the threshold);
+        in that case, a larger smooth yields a larger dice.
 
     Examples
     ---------
     >>> outputs = tl.act.pixel_wise_softmax(network.outputs)
-    >>> dice_loss = 1 - tl.cost.dice_coe(outputs, y_, epsilon=1e-5)
+    >>> dice_loss = 1 - tl.cost.dice_coe(outputs, y_)
 
     References
     -----------
-    - `wiki-dice `_
+    - `Wiki-Dice `_
     """
-    # inse = tf.reduce_sum( tf.mul(output, target) )
-    # l = tf.reduce_sum( tf.mul(output, output) )
-    # r = tf.reduce_sum( tf.mul(target, target) )
-    inse = tf.reduce_sum( output * target )
-    l = tf.reduce_sum( output * output )
-    r = tf.reduce_sum( target * target )
-    dice = 2 * (inse) / (l + r)
-    if epsilon == 0:
-        return dice
+    inse = tf.reduce_sum(output * target, axis=axis)
+    if loss_type == 'jaccard':
+        l = tf.reduce_sum(output * output, axis=axis)
+        r = tf.reduce_sum(target * target, axis=axis)
+    elif loss_type == 'sorensen':
+        l = tf.reduce_sum(output, axis=axis)
+        r = tf.reduce_sum(target, axis=axis)
     else:
-        return tf.clip_by_value(dice, 0, 1.0-epsilon)
-
-
-def dice_hard_coe(output, target, epsilon=1e-10):
-    """Non-differentiable Sørensen–Dice coefficient for comparing the similarity of two distributions,
-    usually be used for binary image segmentation i.e. labels are binary.
-    The coefficient = [0, 1], 1 if totally match.
+        raise Exception("Unknown loss_type")
+    ## old axis=[0,1,2,3]
+    # dice = 2 * (inse) / (l + r)
+    # epsilon = 1e-5
+    # dice = tf.clip_by_value(dice, 0, 1.0-epsilon) # if all empty, dice = 1
+    ## new haodong
+    dice = (2. * inse + smooth) / (l + r + smooth)
+    ##
+    dice = tf.reduce_mean(dice)
+    return dice
+
+
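The intended division of labour for binary segmentation, sketched below: the soft dice above is differentiable and can serve as the training loss, while `dice_hard_coe` and `iou_coe` (added just below) are evaluation metrics only. Shapes and the assumption of sigmoid outputs in [0, 1] are illustrative:

```python
import tensorflow as tf
import tensorlayer as tl

y_ = tf.placeholder(tf.float32, [None, 256, 256, 1])       # binary ground truth
outputs = tf.placeholder(tf.float32, [None, 256, 256, 1])  # e.g. sigmoid network outputs
dice_loss = 1 - tl.cost.dice_coe(outputs, y_, loss_type='jaccard')  # differentiable, for training
hard_dice = tl.cost.dice_hard_coe(outputs, y_, threshold=0.5)       # metric only
iou = tl.cost.iou_coe(outputs, y_, threshold=0.5)                   # metric only
```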
+def dice_hard_coe(output, target, threshold=0.5, axis=[1,2,3], smooth=1e-5):
+    """Non-differentiable Sørensen–Dice coefficient for comparing the similarity
+    of two batches of data, usually used for binary image segmentation i.e. labels are binary.
+    The coefficient ranges from 0 to 1; 1 means a perfect match.
 
     Parameters
     -----------
@@ -168,32 +221,37 @@
         A distribution with shape: [batch_size, ....], (any dimensions).
     target : tensor
         A distribution with shape: [batch_size, ....], (any dimensions).
-    epsilon : float
-        An optional name to attach to this layer.
-
-    Examples
-    ---------
-    >>> outputs = pixel_wise_softmax(network.outputs)
-    >>> dice_loss = 1 - dice_coe(outputs, y_, epsilon=1e-5)
+    threshold : float
+        The threshold value above which values count as true.
+    axis : list of integer
+        All dimensions are reduced, default ``[1,2,3]``.
+    smooth : float
+        This small value will be added to the numerator and denominator, see ``dice_coe``.
 
     References
     -----------
-    - `wiki-dice `_
+    - `Wiki-Dice `_
     """
-    output = tf.cast(output > 0.5, dtype=tf.float32)
-    target = tf.cast(target > 0.5, dtype=tf.float32)
-    inse = tf.reduce_sum( output * target )
-    l = tf.reduce_sum( output * output )
-    r = tf.reduce_sum( target * target )
-    dice = 2 * (inse) / (l + r)
-    if epsilon == 0:
-        return dice
-    else:
-        return tf.clip_by_value(dice, 0, 1.0-epsilon)
-
-def iou_coe(output, target, threshold=0.5, epsilon=1e-10):
-    """Non-differentiable Intersection over Union, usually be used for evaluating binary image segmentation.
-    The coefficient = [0, 1], 1 means totally match.
+    output = tf.cast(output > threshold, dtype=tf.float32)
+    target = tf.cast(target > threshold, dtype=tf.float32)
+    inse = tf.reduce_sum(tf.multiply(output, target), axis=axis)
+    l = tf.reduce_sum(output, axis=axis)
+    r = tf.reduce_sum(target, axis=axis)
+    ## old axis=[0,1,2,3]
+    # hard_dice = 2 * (inse) / (l + r)
+    # epsilon = 1e-5
+    # hard_dice = tf.clip_by_value(hard_dice, 0, 1.0-epsilon)
+    ## new haodong
+    hard_dice = (2. * inse + smooth) / (l + r + smooth)
+    ##
+    hard_dice = tf.reduce_mean(hard_dice)
+    return hard_dice
+
+
+def iou_coe(output, target, threshold=0.5, axis=[1,2,3], smooth=1e-5):
+    """Non-differentiable Intersection over Union (IoU) for comparing the
+    similarity of two batches of data, usually used for evaluating binary image segmentation.
+    The coefficient ranges from 0 to 1; 1 means a perfect match.
 
     Parameters
     -----------
@@ -203,23 +261,59 @@
         A distribution with shape: [batch_size, ....], (any dimensions).
     threshold : float
         The threshold value to be true.
-    epsilon : float
-        A small value to avoid zero denominator when both output and target output nothing.
-
-    Examples
-    --------
-    >>> outputs = tl.act.pixel_wise_softmax(network.outputs)
-    >>> iou = tl.cost.iou_coe(outputs[:,:,:,0], y_[:,:,:,0])
+    axis : list of integer
+        All dimensions are reduced, default ``[1,2,3]``.
+    smooth : float
+        This small value will be added to the numerator and denominator, see ``dice_coe``.
 
     Notes
     ------
-    - IOU cannot be used as training loss, people usually use dice coefficient for training, and IOU for evaluating.
+    - IoU cannot be used as a training loss; people usually use the dice coefficient for training, and IoU and hard-dice for evaluation.
     """
     pre = tf.cast(output > threshold, dtype=tf.float32)
     truth = tf.cast(target > threshold, dtype=tf.float32)
-    intersection = tf.reduce_sum(pre * truth)
-    union = tf.reduce_sum(tf.cast((pre + truth) > threshold, dtype=tf.float32))
-    return tf.reduce_sum(intersection) / (tf.reduce_sum(union) + epsilon)
+    inse = tf.reduce_sum(tf.multiply(pre, truth), axis=axis)  # AND
+    union = tf.reduce_sum(tf.cast(tf.add(pre, truth) >= 1, dtype=tf.float32), axis=axis)  # OR
+    ## old axis=[0,1,2,3]
+    # epsilon = 1e-5
+    # batch_iou = inse / (union + epsilon)
+    ## new haodong
+    batch_iou = (inse + smooth) / (union + smooth)
+    iou = tf.reduce_mean(batch_iou)
+    return iou#, pre, truth, inse, union
+
+# ## test soft/hard dice and iou
+# import numpy as np
+# y = np.zeros((1,10,10,1))
+# # y[0,0:5,0:5]=1.0
+# o = np.zeros((1,10,10,1))
+# # o[:,:,:,:] = 0            # what we want: dice=0   iou=0  OK
+# # o[0,0:2,0:2]=0.3          # what we want: dice larger  iou=0  OK
+# # o[0,0:2,0:2]=0.6          # what we want: dice larger  iou small  OK
+# # o[0,0:3,0:3]=0.6          # what we want: dice larger  iou larger  OK
+# # o[0,0:3,0:3]=1            # what we want: dice larger  iou same  OK
+# # o[0,0:5,0:5]=1            # what we want: dice=1   iou=1  OK
+# # o[0,0:5,0:5]=0.3          # what we want: dice smaller  iou=0  OK
+# # o[0,0:5,0:5]=1e-2         # what we want: dice≈0   iou=0  OK
+# # o[0,8:10,8:10]=1.0        # what we want: dice=0   iou=0  OK
+# # o[0,8:10,8:10]=1e-10      # what we want: dice=0   iou=0  OK
+# # y[:,:,:,:] = o[:,:,:,:] = 0 # what we want: dice=1 iou=1  OK
+# ## why in u-net, dice=1 hard-dice=1 iou=1 exist?? print bug?
+#
+# d = dice_coe(o, y, 'jaccard', smooth=1.)
+# hd = dice_hard_coe(o, y, smooth=1e-5) +# i = iou_coe(o, y, smooth=1e-5) +# sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) +# # sess.run(tf.local_variables_initializer()) +# print(sess.run([d,hd,i])) +# # p, t, i, u = sess.run([pre, truth, inse, union]) +# # import pprint +# # pprint.pprint(((y>0.5)*(o>0.5)).astype(int).tolist()) +# # pprint.pprint(p.tolist()) +# # pprint.pprint(t.tolist()) +# # pprint.pprint(i) +# # pprint.pprint(u) +# exit() def cross_entropy_seq(logits, target_seqs, batch_size=None):#, batch_size=1, num_steps=None): @@ -242,10 +336,10 @@ def cross_entropy_seq(logits, target_seqs, batch_size=None):#, batch_size=1, num >>> targets = tf.placeholder(tf.int32, [batch_size, num_steps]) >>> cost = tl.cost.cross_entropy_seq(network.outputs, targets) """ - try: # TF 1.0 - sequence_loss_by_example_fn = tf.contrib.legacy_seq2seq.sequence_loss_by_example - except: - sequence_loss_by_example_fn = tf.nn.seq2seq.sequence_loss_by_example + # try: # TF 1.0 + sequence_loss_by_example_fn = tf.contrib.legacy_seq2seq.sequence_loss_by_example + # except: + # sequence_loss_by_example_fn = tf.nn.seq2seq.sequence_loss_by_example loss = sequence_loss_by_example_fn( [logits], @@ -283,14 +377,14 @@ def cross_entropy_seq_with_mask(logits, target_seqs, input_mask, return_details= losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets, name=name) * weights #losses = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets, name=name)) # for TF1.0 and others - try: ## TF1.0 - loss = tf.divide(tf.reduce_sum(losses), # loss from mask. reduce_sum before element-wise mul with mask !! - tf.reduce_sum(weights), - name="seq_loss_with_mask") - except: ## TF0.12 - loss = tf.div(tf.reduce_sum(losses), # loss from mask. reduce_sum before element-wise mul with mask !! - tf.reduce_sum(weights), - name="seq_loss_with_mask") + # try: ## TF1.0 + loss = tf.divide(tf.reduce_sum(losses), # loss from mask. reduce_sum before element-wise mul with mask !! + tf.reduce_sum(weights), + name="seq_loss_with_mask") + # except: ## TF0.12 + # loss = tf.div(tf.reduce_sum(losses), # loss from mask. reduce_sum before element-wise mul with mask !! 
+ # tf.reduce_sum(weights), + # name="seq_loss_with_mask") if return_details: return loss, losses, weights, targets else: @@ -308,10 +402,10 @@ def cosine_similarity(v1, v2): ----------- a tensor of [batch_size, ] """ - try: ## TF1.0 - cost = tf.reduce_sum(tf.multiply(v1, v2), 1) / (tf.sqrt(tf.reduce_sum(tf.multiply(v1, v1), 1)) * tf.sqrt(tf.reduce_sum(tf.multiply(v2, v2), 1))) - except: ## TF0.12 - cost = tf.reduce_sum(tf.mul(v1, v2), reduction_indices=1) / (tf.sqrt(tf.reduce_sum(tf.mul(v1, v1), reduction_indices=1)) * tf.sqrt(tf.reduce_sum(tf.mul(v2, v2), reduction_indices=1))) + # try: ## TF1.0 + cost = tf.reduce_sum(tf.multiply(v1, v2), 1) / (tf.sqrt(tf.reduce_sum(tf.multiply(v1, v1), 1)) * tf.sqrt(tf.reduce_sum(tf.multiply(v2, v2), 1))) + # except: ## TF0.12 + # cost = tf.reduce_sum(tf.mul(v1, v2), reduction_indices=1) / (tf.sqrt(tf.reduce_sum(tf.mul(v1, v1), reduction_indices=1)) * tf.sqrt(tf.reduce_sum(tf.mul(v2, v2), reduction_indices=1))) return cost @@ -359,14 +453,14 @@ def li(weights, name=None): my_scale = ops.convert_to_tensor(scale, dtype=weights.dtype.base_dtype, name='scale') - if tf.__version__ <= '0.12': - standard_ops_fn = standard_ops.mul - else: - standard_ops_fn = standard_ops.multiply - return standard_ops_fn( - my_scale, - standard_ops.reduce_sum(standard_ops.sqrt(standard_ops.reduce_sum(tf.square(weights), 1))), - name=scope) + # if tf.__version__ <= '0.12': + # standard_ops_fn = standard_ops.mul + # else: + standard_ops_fn = standard_ops.multiply + return standard_ops_fn( + my_scale, + standard_ops.reduce_sum(standard_ops.sqrt(standard_ops.reduce_sum(tf.square(weights), 1))), + name=scope) return li @@ -414,10 +508,10 @@ def lo(weights, name='lo_regularizer'): my_scale = ops.convert_to_tensor(scale, dtype=weights.dtype.base_dtype, name='scale') - if tf.__version__ <= '0.12': - standard_ops_fn = standard_ops.mul - else: - standard_ops_fn = standard_ops.multiply + # if tf.__version__ <= '0.12': + # standard_ops_fn = standard_ops.mul + # else: + standard_ops_fn = standard_ops.multiply return standard_ops_fn( my_scale, standard_ops.reduce_sum(standard_ops.sqrt(standard_ops.reduce_sum(tf.square(weights), 0))), @@ -467,10 +561,10 @@ def mn(weights, name='max_regularizer'): my_scale = ops.convert_to_tensor(scale, dtype=weights.dtype.base_dtype, name='scale') - if tf.__version__ <= '0.12': - standard_ops_fn = standard_ops.mul - else: - standard_ops_fn = standard_ops.multiply + # if tf.__version__ <= '0.12': + # standard_ops_fn = standard_ops.mul + # else: + standard_ops_fn = standard_ops.multiply return standard_ops_fn(my_scale, standard_ops.reduce_max(standard_ops.abs(weights)), name=scope) return mn diff --git a/tensorlayer/db.py b/tensorlayer/db.py index af9f5ba7..3858224b 100644 --- a/tensorlayer/db.py +++ b/tensorlayer/db.py @@ -1,5 +1,5 @@ #! /usr/bin/python -# -*- coding: utf8 -*- +# -*- coding: utf-8 -*- """ Experimental Database Management System. diff --git a/tensorlayer/distributed.py b/tensorlayer/distributed.py new file mode 100644 index 00000000..6e9c223a --- /dev/null +++ b/tensorlayer/distributed.py @@ -0,0 +1,308 @@ +#! /usr/bin/python +# -*- coding: utf-8 -*- +import tensorflow as tf +from tensorflow.python.training import session_run_hook +import os +import sys +import json +import time + + +class TaskSpecDef(object): + """Specification for the distributed task with the job name, index of the task, + the parameter servers and the worker servers. 
If you want to use the last worker
+    for continuous evaluation you can call the method `user_last_worker_as_evaluator`
+    which returns a new :class:`TaskSpecDef` object without the last worker in the
+    cluster specification.
+
+    Parameters
+    ----------
+    type : A string with the job name, it will be `master`, `worker` or `ps`.
+    index : The zero-based index of the task. Distributed training jobs will have a single
+        master task, one or more parameter servers, and one or more workers.
+    trial : The identifier of the trial being run.
+    ps_hosts : A string with a comma-separated list of hosts for the parameter servers
+        or a list of hosts.
+    worker_hosts : A string with a comma-separated list of hosts for the worker servers
+        or a list of hosts.
+    master : A string with the master hosts.
+
+    Note
+    ----------
+    master might not be included in TF_CONFIG and can be None. The shard_index is adjusted
+    in any case to assign 0 to master and >= 1 to workers.
+    This implementation doesn't support sparse arrays in the `TF_CONFIG` variable as the
+    official TensorFlow documentation shows, as it is not supported by the JSON
+    definition.
+
+    References
+    ----------
+    - `ML-engine trainer considerations `_
+    """
+
+    def __init__(self, type='master', index=0, trial=None, ps_hosts=None, worker_hosts=None,
+                 master=None):
+        self.type = type
+        self._index = int(index)
+        self._cluster_spec = None
+        self.num_workers = 1
+        self.num_ps = 0
+        self.shard_index = int(index)
+        self._master = True
+        self.trial = trial
+        self.ps_hosts = ps_hosts
+        self.worker_hosts = worker_hosts
+        self.master = master
+
+        if ps_hosts and worker_hosts:
+            ps = ps_hosts if isinstance(ps_hosts, list) else ps_hosts.split(',')
+            self.num_ps = len(ps)
+            worker = worker_hosts if isinstance(worker_hosts, list) else worker_hosts.split(',')
+            if master is not None and len(master) > 0:
+                self._cluster_spec = tf.train.ClusterSpec({'ps'    : ps,
+                                                           'worker': worker,
+                                                           'master': master})
+                # master is a worker too
+                self.num_workers = len(worker) + 1
+                if self.type == 'worker':
+                    self.shard_index = self._index + 1
+                self._master = self.type == 'master'
+            else:
+                self._cluster_spec = tf.train.ClusterSpec({'ps'    : ps,
+                                                           'worker': worker})
+                if self.type == 'worker':
+                    self.shard_index = self._index
+                self._master = self.type == 'worker' and self._index == 0
+
+            # create server and join if it is a parameter server
+            self._server = tf.train.Server(self._cluster_spec,
+                                           job_name=self.type,
+                                           task_index=self._index)
+            if self.is_ps():
+                self._server.join()
+        else:
+            self._server = None
+
+    def is_ps(self):
+        """Returns true if this server is a parameter server"""
+        return self.type == 'ps'
+
+    def is_worker(self):
+        """Returns true if this server is a worker server"""
+        return self.type == 'worker'
+
+    def is_master(self):
+        """Returns true if this server is the master server"""
+        return self._master
+
+    def is_evaluator(self):
+        """Returns true if this server is the evaluator server"""
+        return self.type == 'worker' and len(self.worker_hosts) == self._index
+
+    def device_fn(self):
+        """Returns the function with the specification to create the graph in this server"""
+        current_device = '/job:{}/task:{}'.format(self.type, self._index)
+        ps_devices = '/job:ps'
+        return tf.train.replica_device_setter(ps_device=ps_devices,
+                                              worker_device=current_device,
+                                              cluster=self._cluster_spec)
+
+    def target(self):
+        if self._server is not None:
+            return self._server.target
+        else:
+            return None
+
+    def user_last_worker_as_evaluator(self):
+        """Returns a new :class:`TaskSpecDef` where the last worker has been removed from
+        the list of worker_hosts, so it is not used for training anymore. You can call
+        is_evaluator to know whether this server is the evaluator one or not.
+        In case there is only one server for training this method raises an exception, as
+        you cannot use any server for evaluation.
+        """
+        if self.worker_hosts is None \
+                or len(self.worker_hosts) == 0 \
+                or (self.master is None and len(self.worker_hosts) == 1):
+            raise Exception('You need more than one worker instance to use one as evaluator')
+        return TaskSpecDef(type=self.type,
+                           index=self._index,
+                           trial=self.trial,
+                           ps_hosts=self.ps_hosts,
+                           worker_hosts=self.worker_hosts[:-1],
+                           master=self.master)
+
+
+def TaskSpec():
+    """Returns a :class:`TaskSpecDef` based on the environment variables for distributed
+    training.
+
+    References
+    ----------
+    - `ML-engine trainer considerations `_
+    - `TensorPort Distributed Computing `_
+    """
+
+    # TF_CONFIG is used in ML-engine
+    if 'TF_CONFIG' in os.environ:
+        env = json.loads(os.environ.get('TF_CONFIG', '{}'))
+        task_data = env.get('task', None) or {'type': 'master', 'index': 0}
+        cluster_data = env.get('cluster', None) or {'ps': None, 'worker': None, 'master': None}
+        return TaskSpecDef(type=task_data['type'],
+                           index=task_data['index'],
+                           trial=task_data['trial'] if 'trial' in task_data else None,
+                           ps_hosts=cluster_data['ps'],
+                           worker_hosts=cluster_data['worker'],
+                           master=cluster_data['master'] if 'master' in cluster_data else None)
+
+    # JOB_NAME, TASK_INDEX, PS_HOSTS, WORKER_HOSTS and MASTER_HOST are used in TensorPort
+    if 'JOB_NAME' in os.environ:
+        return TaskSpecDef(type=os.environ['JOB_NAME'],
+                           index=os.environ['TASK_INDEX'],
+                           ps_hosts=os.environ.get('PS_HOSTS', None),
+                           worker_hosts=os.environ.get('WORKER_HOSTS', None),
+                           master=os.environ.get('MASTER_HOST', None))
+    return None
+
+
+def DistributedSession(task_spec=None,
+                       checkpoint_dir=None,
+                       scaffold=None,
+                       hooks=None,
+                       chief_only_hooks=None,
+                       save_checkpoint_secs=600,
+                       save_summaries_steps=object(),
+                       save_summaries_secs=object(),
+                       config=None,
+                       stop_grace_period_secs=120,
+                       log_step_count_steps=100):
+    """Creates a distributed session. It calls `MonitoredTrainingSession` to create a
+    :class:`MonitoredSession` for distributed training.
+
+    Examples
+    --------
+
+    A simple example for distributed training where all the workers use the same dataset:
+
+    >>> task_spec = TaskSpec()
+    >>> with tf.device(task_spec.device_fn()):
+    >>>    tensors = create_graph()
+    >>> with tl.DistributedSession(task_spec=task_spec,
+    ...
checkpoint_dir='/tmp/ckpt') as session: + >>> while not session.should_stop(): + >>> session.run(tensors) + + An example where the dataset is shared among the workers + (see https://www.tensorflow.org/programmers_guide/datasets): + + >>> task_spec = TaskSpec() + >>> # dataset is a :class:`tf.data.Dataset` with the raw data + >>> dataset = create_dataset() + >>> if task_spec is not None: + >>> dataset = dataset.shard(task_spec.num_workers, task_spec.shard_index) + >>> # shuffle or apply a map function to the new sharded dataset, for example: + >>> dataset = dataset.shuffle(buffer_size=10000) + >>> dataset = dataset.batch(batch_size) + >>> dataset = dataset.repeat(num_epochs) + >>> # create the iterator for the dataset and the input tensor + >>> iterator = dataset.make_one_shot_iterator() + >>> next_element = iterator.get_next() + >>> with tf.device(task_spec.device_fn()): + >>> # next_element is the input for the graph + >>> tensors = create_graph(next_element) + >>> with tl.DistributedSession(task_spec=task_spec, + ... checkpoint_dir='/tmp/ckpt') as session: + >>> while not session.should_stop(): + >>> session.run(tensors) + + + Parameters + ---------- + task_spec : TaskSpecDef. The task spec definition from TaskSpec() + checkpoint_dir : A string. Optional path to a directory where to restore + variables. + scaffold : A `Scaffold` used for gathering or building supportive ops. If + not specified, a default one is created. It's used to finalize the graph. + hooks : Optional list of `SessionRunHook` objects. + chief_only_hooks : list of `SessionRunHook` objects. Activate these hooks if + `is_chief==True`, ignore otherwise. + save_checkpoint_secs : The frequency, in seconds, that a checkpoint is saved + using a default checkpoint saver. If `save_checkpoint_secs` is set to + `None`, then the default checkpoint saver isn't used. + save_summaries_steps : The frequency, in number of global steps, that the + summaries are written to disk using a default summary saver. If both + `save_summaries_steps` and `save_summaries_secs` are set to `None`, then + the default summary saver isn't used. Default 100. + save_summaries_secs : The frequency, in secs, that the summaries are written + to disk using a default summary saver. If both `save_summaries_steps` and + `save_summaries_secs` are set to `None`, then the default summary saver + isn't used. Default not enabled. + config : an instance of `tf.ConfigProto` proto used to configure the session. + It's the `config` argument of constructor of `tf.Session`. + stop_grace_period_secs : Number of seconds given to threads to stop after + `close()` has been called. + log_step_count_steps : The frequency, in number of global steps, that the + global step/sec is logged. + + References + ---------- + - `MonitoredTrainingSession `_ + """ + target = task_spec.target() if task_spec is not None else None + is_chief = task_spec.is_master() if task_spec is not None else True + return tf.train.MonitoredTrainingSession(master=target, + is_chief=is_chief, + checkpoint_dir=checkpoint_dir, + scaffold=scaffold, + save_checkpoint_secs=save_checkpoint_secs, + save_summaries_steps=save_summaries_steps, + save_summaries_secs=save_summaries_secs, + log_step_count_steps=log_step_count_steps, + stop_grace_period_secs=stop_grace_period_secs, + config=config, + hooks=hooks, + chief_only_hooks=chief_only_hooks) + + + +class StopAtTimeHook(session_run_hook.SessionRunHook): + """Hook that requests stop after a specified time. 
+
+    Parameters
+    ----------
+    time_running: Maximum time running in seconds
+    """
+
+    def __init__(self, time_running):
+        self._time_running = time_running
+
+    def begin(self):
+        self._end_time = time.time() + self._time_running
+
+    def after_run(self, run_context, run_values):
+        if time.time() > self._end_time:
+            run_context.request_stop()
+
+
+class LoadCheckpoint(session_run_hook.SessionRunHook):
+    """Hook that loads a checkpoint after the session is created.
+
+    >>> from tensorflow.python.ops import variables as tf_variables
+    >>> from tensorflow.python.training.monitored_session import SingularMonitoredSession
+    >>>
+    >>> tensors = create_graph()
+    >>> saver = tf.train.Saver(var_list=tf_variables.trainable_variables())
+    >>> checkpoint_hook = LoadCheckpoint(saver, my_checkpoint_file)
+    >>> with SingularMonitoredSession(hooks=[checkpoint_hook]) as session:
+    >>>     while not session.should_stop():
+    >>>         session.run(tensors)
+    """
+
+    def __init__(self, saver, checkpoint):
+        self._saver = saver
+        self._checkpoint = checkpoint
+        self._loaded = False
+
+    def after_create_session(self, session, coord):
+        if not self._loaded:
+            self._loaded = True
+            self._saver.restore(session, self._checkpoint)
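A sketch of how the pieces of this new module fit together, assuming the process runs under a populated `TF_CONFIG` (ML-engine) or TensorPort environment so that `TaskSpec()` returns a `TaskSpecDef`; `build_graph()` is a hypothetical model-building helper:

```python
import tensorflow as tf
import tensorlayer as tl

task_spec = tl.distributed.TaskSpec()
with tf.device(task_spec.device_fn()):
    train_op = build_graph()  # hypothetical: returns the training op
# Optional: ask every worker to stop after one hour of training.
hooks = [tl.distributed.StopAtTimeHook(time_running=3600)]
with tl.distributed.DistributedSession(task_spec=task_spec,
                                       checkpoint_dir='/tmp/ckpt',
                                       hooks=hooks) as session:
    while not session.should_stop():
        session.run(train_op)
```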
plotable : True, False Whether to plot some image examples. second : int If ``plotable`` is True, ``second`` is the display time. path : string - Path to download data to, defaults to data/cifar10/ + The path that the data is downloaded to, defaults is ``data/cifar10/``. Examples -------- - >>> X_train, y_train, X_test, y_test = tl.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=True) - - Notes - ------ - CIFAR-10 images can only be display without color change under uint8. - >>> X_train = np.asarray(X_train, dtype=np.uint8) - >>> plt.ion() - >>> fig = plt.figure(1232) - >>> count = 1 - >>> for row in range(10): - >>> for col in range(10): - >>> a = fig.add_subplot(10, 10, count) - >>> plt.imshow(X_train[count-1], interpolation='nearest') - >>> plt.gca().xaxis.set_major_locator(plt.NullLocator()) # 不显示刻度(tick) - >>> plt.gca().yaxis.set_major_locator(plt.NullLocator()) - >>> count = count + 1 - >>> plt.draw() - >>> plt.pause(3) + >>> X_train, y_train, X_test, y_test = tl.files.load_cifar10_dataset(shape=(-1, 32, 32, 3)) References ---------- @@ -132,7 +116,7 @@ def load_cifar10_dataset(shape=(-1, 32, 32, 3), path='data/cifar10/', plotable=F - `Data download link `_ - `Code references `_ """ - + path = os.path.join(path, 'cifar10') print("Load or Download cifar10 > {}".format(path)) #Helper function to unpickle the data @@ -218,40 +202,17 @@ def unpickle(file): return X_train, y_train, X_test, y_test - -def load_ptb_dataset(path='data/ptb/'): +def load_ptb_dataset(path='data'): """Penn TreeBank (PTB) dataset is used in many LANGUAGE MODELING papers, including "Empirical Evaluation and Combination of Advanced Language Modeling Techniques", "Recurrent Neural Network Regularization". - It consists of 929k training words, 73k validation words, and 82k test words. It has 10k words in its vocabulary. - In "Recurrent Neural Network Regularization", they trained regularized LSTMs - of two sizes; these are denoted the medium LSTM and large LSTM. Both LSTMs - have two layers and are unrolled for 35 steps. They initialize the hidden - states to zero. They then use the final hidden states of the current - minibatch as the initial hidden state of the subsequent minibatch - (successive minibatches sequentially traverse the training set). - The size of each minibatch is 20. - - The medium LSTM has 650 units per layer and its parameters are initialized - uniformly in [−0.05, 0.05]. They apply 50% dropout on the non-recurrent - connections. They train the LSTM for 39 epochs with a learning rate of 1, - and after 6 epochs they decrease it by a factor of 1.2 after each epoch. - They clip the norm of the gradients (normalized by minibatch size) at 5. - - The large LSTM has 1500 units per layer and its parameters are initialized - uniformly in [−0.04, 0.04]. We apply 65% dropout on the non-recurrent - connections. They train the model for 55 epochs with a learning rate of 1; - after 14 epochs they start to reduce the learning rate by a factor of 1.15 - after each epoch. They clip the norm of the gradients (normalized by - minibatch size) at 10. - Parameters ---------- path : : string - Path to download data to, defaults to data/ptb/ + The path that the data is downloaded to, defaults is ``data/ptb/``. 
Returns -------- @@ -261,14 +222,12 @@ def load_ptb_dataset(path='data/ptb/'): -------- >>> train_data, valid_data, test_data, vocab_size = tl.files.load_ptb_dataset() - Code References + References --------------- - ``tensorflow.models.rnn.ptb import reader`` - - Download Links - --------------- - `Manual download `_ """ + path = os.path.join(path, 'ptb') print("Load or Download Penn TreeBank (PTB) dataset > {}".format(path)) #Maybe dowload and uncompress tar, or load exsisting files @@ -295,8 +254,7 @@ def load_ptb_dataset(path='data/ptb/'): # exit() return train_data, valid_data, test_data, vocabulary - -def load_matt_mahoney_text8_dataset(path='data/mm_test8/'): +def load_matt_mahoney_text8_dataset(path='data'): """Download a text file from Matt Mahoney's website if not present, and make sure it's the right size. Extract the first file enclosed in a zip file as a list of words. @@ -305,7 +263,7 @@ def load_matt_mahoney_text8_dataset(path='data/mm_test8/'): Parameters ---------- path : : string - Path to download data to, defaults to data/mm_test8/ + The path that the data is downloaded to, defaults is ``data/mm_test8/``. Returns -------- @@ -318,7 +276,7 @@ def load_matt_mahoney_text8_dataset(path='data/mm_test8/'): >>> words = tl.files.load_matt_mahoney_text8_dataset() >>> print('Data size', len(words)) """ - + path = os.path.join(path, 'mm_test8') print("Load or Download matt_mahoney_text8 Dataset> {}".format(path)) filename = 'text8.zip' @@ -327,11 +285,11 @@ def load_matt_mahoney_text8_dataset(path='data/mm_test8/'): with zipfile.ZipFile(os.path.join(path, filename)) as f: word_list = f.read(f.namelist()[0]).split() - + for idx, word in enumerate(word_list): + word_list[idx] = word_list[idx].decode() return word_list - -def load_imdb_dataset(path='data/imdb/', nb_words=None, skip_top=0, +def load_imdb_dataset(path='data', nb_words=None, skip_top=0, maxlen=None, test_split=0.2, seed=113, start_char=1, oov_char=2, index_from=3): """Load IMDB dataset @@ -339,11 +297,11 @@ def load_imdb_dataset(path='data/imdb/', nb_words=None, skip_top=0, Parameters ---------- path : : string - Path to download data to, defaults to data/imdb/ + The path that the data is downloaded to, defaults is ``data/imdb/``. Examples -------- - >>> X_train, y_train, X_test, y_test = tl.files.load_imbd_dataset( + >>> X_train, y_train, X_test, y_test = tl.files.load_imdb_dataset( ... nb_words=20000, test_split=0.2) >>> print('X_train.shape', X_train.shape) ... (20000,) [[1, 62, 74, ... 1033, 507, 27],[1, 60, 33, ... 13, 1053, 7]..] @@ -354,6 +312,7 @@ def load_imdb_dataset(path='data/imdb/', nb_words=None, skip_top=0, ----------- - `Modified from keras. `_ """ + path = os.path.join(path, 'imdb') filename = "imdb.pkl" url = 'https://s3.amazonaws.com/text-datasets/' @@ -415,14 +374,14 @@ def load_imdb_dataset(path='data/imdb/', nb_words=None, skip_top=0, return X_train, y_train, X_test, y_test -def load_nietzsche_dataset(path='data/nietzsche/'): +def load_nietzsche_dataset(path='data'): """Load Nietzsche dataset. Returns a string. Parameters ---------- path : string - Path to download data to, defaults to data/nietzsche/ + The path that the data is downloaded to, defaults is ``data/nietzsche/``. 
Examples -------- @@ -432,6 +391,7 @@ def load_nietzsche_dataset(path='data/nietzsche/'): >>> words = words.split() """ print("Load or Download nietzsche dataset > {}".format(path)) + path = os.path.join(path, 'nietzsche') filename = "nietzsche.txt" url = 'https://s3.amazonaws.com/text-datasets/' @@ -441,7 +401,7 @@ def load_nietzsche_dataset(path='data/nietzsche/'): words = f.read() return words -def load_wmt_en_fr_dataset(path='data/wmt_en_fr/'): +def load_wmt_en_fr_dataset(path='data'): """It will download English-to-French translation data from the WMT'15 Website (10^9-French-English corpus), and the 2013 news test from the same site as development set. @@ -450,7 +410,7 @@ def load_wmt_en_fr_dataset(path='data/wmt_en_fr/'): Parameters ---------- path : string - Path to download data to, defaults to data/wmt_en_fr/ + The path that the data is downloaded to, defaults is ``data/wmt_en_fr/``. References ---------- @@ -460,6 +420,7 @@ def load_wmt_en_fr_dataset(path='data/wmt_en_fr/'): ----- Usually, it will take a long time to download this dataset. """ + path = os.path.join(path, 'wmt_en_fr') # URLs for WMT data. _WMT_ENFR_TRAIN_URL = "http://www.statmt.org/wmt10/" _WMT_ENFR_DEV_URL = "http://www.statmt.org/wmt15/" @@ -505,8 +466,554 @@ def get_wmt_enfr_dev_set(path): return train_path, dev_path +def load_flickr25k_dataset(tag='sky', path="data", n_threads=50, printable=False): + """Returns a list of images by a given tag from Flick25k dataset, + it will download Flickr25k from `the official website `_ + at the first time you use it. + + Parameters + ------------ + tag : string or None + If you want to get images with tag, use string like 'dog', 'red', see `Flickr Search `_. + If you want to get all images, set to ``None``. + path : string + The path that the data is downloaded to, defaults is ``data/flickr25k/``. + n_threads : int, number of thread to read image. + printable : bool, print infomation when reading images, default is ``False``. + + Examples + ----------- + - Get images with tag of sky + >>> images = tl.files.load_flickr25k_dataset(tag='sky') + + - Get all images + >>> images = tl.files.load_flickr25k_dataset(tag=None, n_threads=100, printable=True) + """ + path = os.path.join(path, 'flickr25k') + + filename = 'mirflickr25k.zip' + url = 'http://press.liacs.nl/mirflickr/mirflickr25k/' + ## download dataset + if folder_exists(path+"/mirflickr") is False: + print("[*] Flickr25k is nonexistent in {}".format(path)) + maybe_download_and_extract(filename, path, url, extract=True) + del_file(path+'/'+filename) + ## return images by the given tag. + # 1. image path list + folder_imgs = path+"/mirflickr" + path_imgs = load_file_list(path=folder_imgs, regx='\\.jpg', printable=False) + path_imgs.sort(key=natural_keys) + # print(path_imgs[0:10]) + # 2. tag path list + folder_tags = path+"/mirflickr/meta/tags" + path_tags = load_file_list(path=folder_tags, regx='\\.txt', printable=False) + path_tags.sort(key=natural_keys) + # print(path_tags[0:10]) + # 3. 
select images + if tag is None: + print("[Flickr25k] reading all images") + else: + print("[Flickr25k] reading images with tag: {}".format(tag)) + images_list = [] + for idx in range(0, len(path_tags)): + tags = read_file(folder_tags+'/'+path_tags[idx]).split('\n') + # print(idx+1, tags) + if tag is None or tag in tags: + images_list.append(path_imgs[idx]) + + images = visualize.read_images(images_list, folder_imgs, n_threads=n_threads, printable=printable) + return images + +def load_flickr1M_dataset(tag='sky', size=10, path="data", n_threads=50, printable=False): + """Returns a list of images by a given tag from Flickr1M dataset, + it will download Flickr1M from `the official website `_ + at the first time you use it. + + Parameters + ------------ + tag : string or None + If you want to get images with tag, use string like 'dog', 'red', see `Flickr Search `_. + If you want to get all images, set to ``None``. + size : int 1 to 10. + 1 means 100k images ... 5 means 500k images, 10 means all 1 million images. Default is 10. + path : string + The path that the data is downloaded to, defaults is ``data/flickr25k/``. + n_threads : int, number of thread to read image. + printable : bool, print infomation when reading images, default is ``False``. + + Examples + ---------- + - Use 200k images + >>> images = tl.files.load_flickr1M_dataset(tag='zebra', size=2) + + - Use 1 Million images + >>> images = tl.files.load_flickr1M_dataset(tag='zebra') + """ + path = os.path.join(path, 'flickr1M') + print("[Flickr1M] using {}% of images = {}".format(size*10, size*100000)) + images_zip = ['images0.zip', 'images1.zip', 'images2.zip', 'images3.zip', + 'images4.zip', 'images5.zip', 'images6.zip', 'images7.zip', + 'images8.zip', 'images9.zip'] + tag_zip = 'tags.zip' + url = 'http://press.liacs.nl/mirflickr/mirflickr1m/' + ## download dataset + for image_zip in images_zip[0:size]: + image_folder = image_zip.split(".")[0] + # print(path+"/"+image_folder) + if folder_exists(path+"/"+image_folder) is False: + # print(image_zip) + print("[Flickr1M] {} is missing in {}".format(image_folder, path)) + maybe_download_and_extract(image_zip, path, url, extract=True) + del_file(path+'/'+image_zip) + os.system("mv {} {}".format(path+'/images',path+'/'+image_folder)) + else: + print("[Flickr1M] {} exists in {}".format(image_folder, path)) + ## download tag + if folder_exists(path+"/tags") is False: + print("[Flickr1M] tag files is nonexistent in {}".format(path)) + maybe_download_and_extract(tag_zip, path, url, extract=True) + del_file(path+'/'+tag_zip) + else: + print("[Flickr1M] tags exists in {}".format(path)) + + ## 1. image path list + images_list = [] + images_folder_list = [] + for i in range(0, size): + images_folder_list += load_folder_list(path=path+'/images%d'%i) + images_folder_list.sort(key=lambda s : int(s.split('/')[-1])) # folder/images/ddd + # print(images_folder_list) + # exit() + for folder in images_folder_list[0:size*10]: + tmp = load_file_list(path=folder, regx='\\.jpg', printable=False) + tmp.sort(key=lambda s : int(s.split('.')[-2])) # ddd.jpg + # print(tmp[0::570]) + images_list.extend([folder+'/'+x for x in tmp]) + # print('IM', len(images_list), images_list[0::6000]) + ## 2. 
tag path list + tag_list = [] + tag_folder_list = load_folder_list(path+"/tags") + tag_folder_list.sort(key=lambda s : int(s.split('/')[-1])) # folder/images/ddd + + for folder in tag_folder_list[0:size*10]: + # print(folder) + tmp = load_file_list(path=folder, regx='\\.txt', printable=False) + tmp.sort(key=lambda s : int(s.split('.')[-2])) # ddd.txt + tmp = [folder+'/'+s for s in tmp] + tag_list += tmp + # print('T', len(tag_list), tag_list[0::6000]) + # exit() + ## 3. select images + print("[Flickr1M] searching tag: {}".format(tag)) + select_images_list = [] + for idx in range(0, len(tag_list)): + tags = read_file(tag_list[idx]).split('\n') + if tag in tags: + select_images_list.append(images_list[idx]) + # print(idx, tags, tag_list[idx], images_list[idx]) + print("[Flickr1M] reading images with tag: {}".format(tag)) + images = visualize.read_images(select_images_list, '', n_threads=n_threads, printable=printable) + return images + +def load_cyclegan_dataset(filename='summer2winter_yosemite', path='data'): + """Load image data from CycleGAN's database, see `this link `_. -## Load and save network + Parameters + ------------ + filename : string + The dataset you want, see `this link `_. + path : string + The path that the data is downloaded to, defaults is `data/cyclegan` + + Examples + --------- + >>> im_train_A, im_train_B, im_test_A, im_test_B = load_cyclegan_dataset(filename='summer2winter_yosemite') + """ + path = os.path.join(path, 'cyclegan') + url = 'https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets/' + + if folder_exists(os.path.join(path, filename)) is False: + print("[*] {} is nonexistent in {}".format(filename, path)) + maybe_download_and_extract(filename+'.zip', path, url, extract=True) + del_file(os.path.join(path, filename+'.zip')) + + def load_image_from_folder(path): + path_imgs = load_file_list(path=path, regx='\\.jpg', printable=False) + return visualize.read_images(path_imgs, path=path, n_threads=10, printable=False) + im_train_A = load_image_from_folder(os.path.join(path, filename, "trainA")) + im_train_B = load_image_from_folder(os.path.join(path, filename, "trainB")) + im_test_A = load_image_from_folder(os.path.join(path, filename, "testA")) + im_test_B = load_image_from_folder(os.path.join(path, filename, "testB")) + + def if_2d_to_3d(images): # [h, w] --> [h, w, 3] + for i in range(len(images)): + if len(images[i].shape) == 2: + images[i] = images[i][:, :, np.newaxis] + images[i] = np.tile(images[i], (1, 1, 3)) + return images + + im_train_A = if_2d_to_3d(im_train_A) + im_train_B = if_2d_to_3d(im_train_B) + im_test_A = if_2d_to_3d(im_test_A) + im_test_B = if_2d_to_3d(im_test_B) + + return im_train_A, im_train_B, im_test_A, im_test_B + +def download_file_from_google_drive(id, destination): + """ Download file from Google Drive, see ``load_celebA_dataset`` for example. + + Parameters + -------------- + id : driver ID + destination : string, save path. 
+ """ + from tqdm import tqdm + import requests + def save_response_content(response, destination, chunk_size=32*1024): + total_size = int(response.headers.get('content-length', 0)) + with open(destination, "wb") as f: + for chunk in tqdm(response.iter_content(chunk_size), total=total_size, + unit='B', unit_scale=True, desc=destination): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + def get_confirm_token(response): + for key, value in response.cookies.items(): + if key.startswith('download_warning'): + return value + return None + URL = "https://docs.google.com/uc?export=download" + session = requests.Session() + + response = session.get(URL, params={ 'id': id }, stream=True) + token = get_confirm_token(response) + + if token: + params = { 'id' : id, 'confirm' : token } + response = session.get(URL, params=params, stream=True) + save_response_content(response, destination) + +def load_celebA_dataset(dirpath='data'): + """ Automatically download celebA dataset, and return a list of image path. """ + import zipfile, os + data_dir = 'celebA' + filename, drive_id = "img_align_celeba.zip", "0B7EVK8r0v71pZjFTYXZWM3FlRnM" + save_path = os.path.join(dirpath, filename) + image_path = os.path.join(dirpath, data_dir) + if os.path.exists(image_path): + print('[*] {} already exists'.format(save_path)) + else: + exists_or_mkdir(dirpath) + download_file_from_google_drive(drive_id, save_path) + zip_dir = '' + with zipfile.ZipFile(save_path) as zf: + zip_dir = zf.namelist()[0] + zf.extractall(dirpath) + os.remove(save_path) + os.rename(os.path.join(dirpath, zip_dir), image_path) + + data_files = load_file_list(path=image_path, regx='\\.jpg', printable=False) + for i in range(len(data_files)): + data_files[i] = os.path.join(image_path, data_files[i]) + return data_files + +def load_voc_dataset(path='data', dataset='2012', contain_classes_in_person=False): + """ Pascal VOC 2007/2012 Dataset has 20 objects : aeroplane, bicycle, bird, boat, bottle, bus, car, cat, chair, cow, diningtable, dog, horse, motorbike, person, pottedplant, sheep, sofa, train, tvmonitor and additional 3 classes : head, hand, foot for person. + + Parameters + ----------- + path : string + The path that the data is downloaded to, defaults is ``data/VOC``. + dataset : string, 2012, 2007, 2007test or 2012test. + The VOC dataset version, we usually train model on 2007+2012 and test it on 2007test. + contain_classes_in_person : If True, dataset will contains labels of head, hand and foot. + + Returns + --------- + imgs_file_list : list of string. + Full paths of all images. + imgs_semseg_file_list : list of string. + Full paths of all maps for semantic segmentation. Note that not all images have this map! + imgs_insseg_file_list : list of string. + Full paths of all maps for instance segmentation. Note that not all images have this map! + imgs_ann_file_list : list of string. + Full paths of all annotations for bounding box and object class, all images have this annotations. + classes : list of string. + Classes in order. + classes_in_person : list of string. + Classes in person. + classes_dict : dictionary. + Class label to integer. + n_objs_list : list of integer + Number of objects in all images in ``imgs_file_list` in order. + objs_info_list : list of string. + Darknet format for the annotation of all images in ``imgs_file_list`` in order. ``[class_id x_centre y_centre width height]`` in ratio format. + objs_info_dicts : dictionary. 
+ ``{imgs_file_list : dictionary for annotation}``, the annotation of all images in ``imgs_file_list``, + format from `TensorFlow/Models/object-detection `_. + + Examples + ---------- + >>> imgs_file_list, imgs_semseg_file_list, imgs_insseg_file_list, imgs_ann_file_list, + >>> classes, classes_in_person, classes_dict, + >>> n_objs_list, objs_info_list, objs_info_dicts = tl.files.load_voc_dataset(dataset="2012", contain_classes_in_person=False) + >>> idx = 26 + >>> print(classes) + ... ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] + >>> print(classes_dict) + ... {'sheep': 16, 'horse': 12, 'bicycle': 1, 'bottle': 4, 'cow': 9, 'sofa': 17, 'car': 6, 'dog': 11, 'cat': 7, 'person': 14, 'train': 18, 'diningtable': 10, 'aeroplane': 0, 'bus': 5, 'pottedplant': 15, 'tvmonitor': 19, 'chair': 8, 'bird': 2, 'boat': 3, 'motorbike': 13} + >>> print(imgs_file_list[idx]) + ... data/VOC/VOC2012/JPEGImages/2007_000423.jpg + >>> print(n_objs_list[idx]) + ... 2 + >>> print(imgs_ann_file_list[idx]) + ... data/VOC/VOC2012/Annotations/2007_000423.xml + >>> print(objs_info_list[idx]) + ... 14 0.173 0.461333333333 0.142 0.496 + ... 14 0.828 0.542666666667 0.188 0.594666666667 + >>> ann = tl.prepro.parse_darknet_ann_str_to_list(objs_info_list[idx]) + >>> print(ann) + ... [[14, 0.173, 0.461333333333, 0.142, 0.496], [14, 0.828, 0.542666666667, 0.188, 0.594666666667]] + >>> c, b = tl.prepro.parse_darknet_ann_list_to_cls_box(ann) + >>> print(c, b) + ... [14, 14] [[0.173, 0.461333333333, 0.142, 0.496], [0.828, 0.542666666667, 0.188, 0.594666666667]] + + References + ------------- + - `Pascal VOC2012 Website `_. + - `Pascal VOC2007 Website `_. + - `TensorFlow/Models/object-detection `_. + """ + path= os.path.join(path, 'VOC') + + def _recursive_parse_xml_to_dict(xml): + """Recursively parses XML contents to python dict. + We assume that `object` tags are the only ones that can appear + multiple times at the same level of a tree. + + Args: + xml: xml tree obtained by parsing XML file contents using lxml.etree + + Returns: + Python dictionary holding XML contents. 
+        """
+        if not xml:
+            return {xml.tag: xml.text}
+        result = {}
+        for child in xml:
+            child_result = _recursive_parse_xml_to_dict(child)
+            if child.tag != 'object':
+                result[child.tag] = child_result[child.tag]
+            else:
+                if child.tag not in result:
+                    result[child.tag] = []
+                result[child.tag].append(child_result[child.tag])
+        return {xml.tag: result}
+
+    from lxml import etree # pip install lxml
+    import xml.etree.ElementTree as ET
+
+    ##
+    if dataset == "2012":
+        url = "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/"
+        tar_filename = "VOCtrainval_11-May-2012.tar"
+        extracted_filename = "VOC2012" #"VOCdevkit/VOC2012"
+        print("    [============= VOC 2012 =============]")
+    elif dataset == "2012test":
+        extracted_filename = "VOC2012test" #"VOCdevkit/VOC2012"
+        print("    [============= VOC 2012 Test Set =============]")
+        print("    \nNote: 2012test only has person annotations, so 2007test is highly recommended for testing!\n")
+        import time
+        time.sleep(3)
+        if os.path.isdir(os.path.join(path, extracted_filename)) is False:
+            print("For VOC 2012 Test data - online registration required")
+            print(" Please download VOC2012test.tar from:  \n register: http://host.robots.ox.ac.uk:8080 \n voc2012 : http://host.robots.ox.ac.uk:8080/eval/challenges/voc2012/ \ndownload: http://host.robots.ox.ac.uk:8080/eval/downloads/VOC2012test.tar")
+            print(" unzip VOC2012test.tar, rename the folder to VOC2012test and put it into %s" % path)
+            exit()
+        # # http://host.robots.ox.ac.uk:8080/eval/downloads/VOC2012test.tar
+        # url = "http://host.robots.ox.ac.uk:8080/eval/downloads/"
+        # tar_filename = "VOC2012test.tar"
+    elif dataset == "2007":
+        url = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/"
+        tar_filename = "VOCtrainval_06-Nov-2007.tar"
+        extracted_filename = "VOC2007"
+        print("    [============= VOC 2007 =============]")
+    elif dataset == "2007test":
+        # http://host.robots.ox.ac.uk/pascal/VOC/voc2007/index.html#testdata
+        # http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
+        url = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/"
+        tar_filename = "VOCtest_06-Nov-2007.tar"
+        extracted_filename = "VOC2007test"
+        print("    [============= VOC 2007 Test Set =============]")
+    else:
+        raise Exception("Please set the dataset argument to 2012, 2012test, 2007 or 2007test.")
+
+    ##======== download dataset
+    if dataset != "2012test":
+        from sys import platform as _platform
+        if folder_exists(os.path.join(path, extracted_filename)) is False:
+            print("[VOC] {} is nonexistent in {}".format(extracted_filename, path))
+            maybe_download_and_extract(tar_filename, path, url, extract=True)
+            del_file(os.path.join(path, tar_filename))
+            if dataset == "2012":
+                if _platform == "win32":
+                    os.system("move {}\VOCdevkit\VOC2012 {}\VOC2012".format(path, path)) # Windows has no ``mv``
+                else:
+                    os.system("mv {}/VOCdevkit/VOC2012 {}/VOC2012".format(path, path))
+            elif dataset == "2007":
+                if _platform == "win32":
+                    os.system("move {}\VOCdevkit\VOC2007 {}\VOC2007".format(path, path))
+                else:
+                    os.system("mv {}/VOCdevkit/VOC2007 {}/VOC2007".format(path, path))
+            elif dataset == "2007test":
+                if _platform == "win32":
+                    os.system("move {}\VOCdevkit\VOC2007 {}\VOC2007test".format(path, path))
+                else:
+                    os.system("mv {}/VOCdevkit/VOC2007 {}/VOC2007test".format(path, path))
+            del_folder(os.path.join(path, 'VOCdevkit'))
+    ##======== object classes (labels)  NOTE: YOU CAN CUSTOMIZE THIS LIST
+    classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car",
+               "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike",
+               "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
+    if contain_classes_in_person:
+        classes_in_person = ["head", "hand", "foot"]
+    else:
+        classes_in_person = []
+
+    classes += classes_in_person # use extra 3 classes for person
+
+    classes_dict = utils.list_string_to_dict(classes)
+    print("[VOC] object classes {}".format(classes_dict))
+
+    ##======== 1. image path list
+    # folder_imgs = path+"/"+extracted_filename+"/JPEGImages/"
+    folder_imgs = os.path.join(path, extracted_filename, "JPEGImages")
+    imgs_file_list = load_file_list(path=folder_imgs, regx='\\.jpg', printable=False)
+    print("[VOC] {} images found".format(len(imgs_file_list)))
+    imgs_file_list.sort(key=lambda s : int(s.replace('.',' ').replace('_', '').split(' ')[-2])) # 2007_000027.jpg --> 2007000027
+    imgs_file_list = [os.path.join(folder_imgs, s) for s in imgs_file_list]
+    # print('IM',imgs_file_list[0::3333], imgs_file_list[-1])
+    if dataset != "2012test":
+        ##======== 2. semantic segmentation maps path list
+        # folder_semseg = path+"/"+extracted_filename+"/SegmentationClass/"
+        folder_semseg = os.path.join(path, extracted_filename, "SegmentationClass")
+        imgs_semseg_file_list = load_file_list(path=folder_semseg, regx='\\.png', printable=False)
+        print("[VOC] {} maps for semantic segmentation found".format(len(imgs_semseg_file_list)))
+        imgs_semseg_file_list.sort(key=lambda s : int(s.replace('.',' ').replace('_', '').split(' ')[-2])) # 2007_000032.png --> 2007000032
+        imgs_semseg_file_list = [os.path.join(folder_semseg, s) for s in imgs_semseg_file_list]
+        # print('Semantic Seg IM',imgs_semseg_file_list[0::333], imgs_semseg_file_list[-1])
+        ##======== 3. instance segmentation maps path list
+        # folder_insseg = path+"/"+extracted_filename+"/SegmentationObject/"
+        folder_insseg = os.path.join(path, extracted_filename, "SegmentationObject")
+        imgs_insseg_file_list = load_file_list(path=folder_insseg, regx='\\.png', printable=False)
+        print("[VOC] {} maps for instance segmentation found".format(len(imgs_insseg_file_list)))
+        imgs_insseg_file_list.sort(key=lambda s : int(s.replace('.',' ').replace('_', '').split(' ')[-2])) # 2007_000032.png --> 2007000032
+        imgs_insseg_file_list = [os.path.join(folder_insseg, s) for s in imgs_insseg_file_list]
+        # print('Instance Seg IM',imgs_insseg_file_list[0::333], imgs_insseg_file_list[-1])
+    else:
+        imgs_semseg_file_list = []
+        imgs_insseg_file_list = []
+    ##======== 4.
annotations for bounding box and object class + # folder_ann = path+"/"+extracted_filename+"/Annotations/" + folder_ann = os.path.join(path, extracted_filename, "Annotations") + imgs_ann_file_list = load_file_list(path=folder_ann, regx='\\.xml', printable=False) + print("[VOC] {} XML annotation files for bounding box and object class found".format(len(imgs_ann_file_list))) + imgs_ann_file_list.sort(key=lambda s : int(s.replace('.',' ').replace('_', '').split(' ')[-2])) # 2007_000027.xml --> 2007000027 + imgs_ann_file_list = [os.path.join(folder_ann, s) for s in imgs_ann_file_list] + # print('ANN',imgs_ann_file_list[0::3333], imgs_ann_file_list[-1]) + + if dataset == "2012test": # remove unused images in JPEG folder + imgs_file_list_new = [] + for ann in imgs_ann_file_list: + ann = os.path.split(ann)[-1].split('.')[0] + for im in imgs_file_list: + if ann in im: + imgs_file_list_new.append(im) + break + imgs_file_list = imgs_file_list_new + print("[VOC] keep %d images" % len(imgs_file_list_new)) + + ##======== parse XML annotations + def convert(size, box): + dw = 1./size[0] + dh = 1./size[1] + x = (box[0] + box[1])/2.0 + y = (box[2] + box[3])/2.0 + w = box[1] - box[0] + h = box[3] - box[2] + x = x*dw + w = w*dw + y = y*dh + h = h*dh + return (x,y,w,h) + + def convert_annotation(file_name): + """ Given VOC2012 XML Annotations, returns number of objects and info. """ + in_file = open(file_name) + out_file = "" + tree = ET.parse(in_file) + root = tree.getroot() + size = root.find('size') + w = int(size.find('width').text) + h = int(size.find('height').text) + n_objs = 0 + + # print(file_name, w, h, size) + # exit() + for obj in root.iter('object'): + if dataset != "2012test": + difficult = obj.find('difficult').text + cls = obj.find('name').text + if cls not in classes or int(difficult) == 1: + continue + else: + cls = obj.find('name').text + if cls not in classes: + continue + cls_id = classes.index(cls) + xmlbox = obj.find('bndbox') + b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text)) + bb = convert((w,h), b) + # out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n') + out_file += str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n' + n_objs += 1 + if cls in "person": + for part in obj.iter('part'): + cls = part.find('name').text + if cls not in classes_in_person: + continue + cls_id = classes.index(cls) + xmlbox = part.find('bndbox') + b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text)) + bb = convert((w,h), b) + # out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n') + out_file += str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n' + n_objs += 1 + in_file.close() + return n_objs, out_file + + print("[VOC] Parsing xml annotations files") + n_objs_list = [] + objs_info_list = [] # Darknet Format list of string + objs_info_dicts = {} + for idx, ann_file in enumerate(imgs_ann_file_list): + # print(ann_file) + n_objs, objs_info = convert_annotation(ann_file) + n_objs_list.append(n_objs) + objs_info_list.append(objs_info) + with tf.gfile.GFile(ann_file, 'r') as fid: + xml_str = fid.read() + xml = etree.fromstring(xml_str) + data = _recursive_parse_xml_to_dict(xml)['annotation'] + objs_info_dicts.update({imgs_file_list[idx]: data}) + + return imgs_file_list, imgs_semseg_file_list, imgs_insseg_file_list, imgs_ann_file_list, \ + classes, classes_in_person, classes_dict,\ + 
n_objs_list, objs_info_list, objs_info_dicts + + + +## Load and save network list npz def save_npz(save_list=[], name='model.npz', sess=None): """Input parameters and the file name, save parameters into .npz file. Use tl.utils.load_npz() to restore. @@ -520,16 +1027,13 @@ def save_npz(save_list=[], name='model.npz', sess=None): Examples -------- - >>> tl.files.save_npz(network.all_params, name='model_test.npz', sess=sess) - ... File saved to: model_test.npz - >>> load_params = tl.files.load_npz(name='model_test.npz') - ... Loading param0, (784, 800) - ... Loading param1, (800,) - ... Loading param2, (800, 800) - ... Loading param3, (800,) - ... Loading param4, (800, 10) - ... Loading param5, (10,) - >>> put parameters into a TensorLayer network, please see assign_params() + - Save model to npz + >>> tl.files.save_npz(network.all_params, name='model.npz', sess=sess) + - Load model from npz (Method 1) + >>> load_params = tl.files.load_npz(name='model.npz') + >>> tl.files.assign_params(sess, load_params, network) + - Load model from npz (Method 2) + >>> tl.files.load_and_assign_npz(sess=sess, name='model.npz', network=network) Notes ----- @@ -561,40 +1065,6 @@ def save_npz(save_list=[], name='model.npz', sess=None): # np.savez(name, **rename_dict) # print('Model is saved to: %s' % name) -def save_npz_dict(save_list=[], name='model.npz', sess=None): - """Input parameters and the file name, save parameters as a dictionary into .npz file. Use tl.utils.load_npz_dict() to restore. - - Parameters - ---------- - save_list : a list - Parameters want to be saved. - name : a string or None - The name of the .npz file. - sess : None or Session - - Notes - ----- - This function tries to avoid a potential broadcasting error raised by numpy. - - """ - ## save params into a list - save_list_var = [] - if sess: - save_list_var = sess.run(save_list) - else: - try: - for k, value in enumerate(save_list): - save_list_var.append(value.eval()) - except: - print(" Fail to save model, Hint: pass the session into this function, save_npz_dict(network.all_params, name='model.npz', sess=sess)") - save_var_dict = {str(idx):val for idx, val in enumerate(save_list_var)} - np.savez(name, **save_var_dict) - save_list_var = None - save_var_dict = None - del save_list_var - del save_var_dict - print("[*] %s saved" % name) - def load_npz(path='', name='model.npz'): """Load the parameters of a Model saved by tl.files.save_npz(). @@ -612,7 +1082,7 @@ def load_npz(path='', name='model.npz'): Examples -------- - - See save_npz and assign_params + - See ``save_npz`` References ---------- @@ -636,25 +1106,6 @@ def load_npz(path='', name='model.npz'): # exit() # return d.items()[0][1]['params'] -def load_npz_dict(path='', name='model.npz'): - """Load the parameters of a Model saved by tl.files.save_npz_dict(). - - Parameters - ---------- - path : a string - Folder path to .npz file. - name : a string or None - The name of the .npz file. - - Returns - -------- - params : list - A list of parameters in order. - """ - d = np.load( path+name ) - saved_list_var = [val[1] for val in sorted(d.items(), key=lambda tup: int(tup[0]))] - return saved_list_var - def assign_params(sess, params, network): """Assign the given parameters to the TensorLayer network. @@ -673,15 +1124,13 @@ def assign_params(sess, params, network): Examples -------- - >>> Save your network as follow: - >>> tl.files.save_npz(network.all_params, name='model_test.npz') - >>> network.print_params() - ... - ... 
Next time, load and assign your network as follow: - >>> tl.layers.initialize_global_variables(sess) - >>> load_params = tl.files.load_npz(name='model_test.npz') + - Save model to npz + >>> tl.files.save_npz(network.all_params, name='model.npz', sess=sess) + - Load model from npz (Method 1) + >>> load_params = tl.files.load_npz(name='model.npz') >>> tl.files.assign_params(sess, load_params, network) - >>> network.print_params() + - Load model from npz (Method 2) + >>> tl.files.load_and_assign_npz(sess=sess, name='model.npz', network=network) References ---------- @@ -711,6 +1160,7 @@ def load_and_assign_npz(sess=None, name=None, network=None): Examples --------- + >>> tl.files.save_npz(net.all_params, name='net.npz', sess=sess) >>> tl.files.load_and_assign_npz(sess=sess, name='net.npz', network=net) """ assert network is not None @@ -724,7 +1174,202 @@ def load_and_assign_npz(sess=None, name=None, network=None): print("[*] Load {} SUCCESS!".format(name)) return network -# Load and save variables +## Load and save network dict npz +def save_npz_dict(save_list=[], name='model.npz', sess=None): + """Input parameters and the file name, save parameters as a dictionary into .npz file. + Use ``tl.files.load_and_assign_npz_dict()`` to restore. + + Parameters + ---------- + save_list : a list to tensor for parameters + Parameters want to be saved. + name : a string + The name of the .npz file. + sess : Session + """ + assert sess is not None + save_list_names = [tensor.name for tensor in save_list] + save_list_var = sess.run(save_list) + save_var_dict = {save_list_names[idx]: val for idx, val in enumerate(save_list_var)} + np.savez(name, **save_var_dict) + save_list_var = None + save_var_dict = None + del save_list_var + del save_var_dict + print("[*] Model saved in npz_dict %s" % name) + +def load_and_assign_npz_dict(name='model.npz', sess=None): + """Restore the parameters saved by ``tl.files.save_npz_dict()``. + + Parameters + ---------- + name : a string + The name of the .npz file. + sess : Session + """ + assert sess is not None + if not os.path.exists(name): + print("[!] Load {} failed!".format(name)) + return False + + params = np.load(name) + if len(params.keys()) != len(set(params.keys())): + raise Exception("Duplication in model npz_dict %s" % name) + ops = list() + for key in params.keys(): + try: + # tensor = tf.get_default_graph().get_tensor_by_name(key) + # varlist = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=key) + varlist = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=key) + if len(varlist) > 1: + raise Exception("[!] Multiple candidate variables to be assigned for name %s" % key) + elif len(varlist) == 0: + raise KeyError + else: + ops.append(varlist[0].assign(params[key])) + print("[*] params restored: %s" % key) + except KeyError: + print("[!] Warning: Tensor named %s not found in network." % key) + + sess.run(ops) + print("[*] Model restored from npz_dict %s" % name) + +# def save_npz_dict(save_list=[], name='model.npz', sess=None): +# """Input parameters and the file name, save parameters as a dictionary into .npz file. Use tl.utils.load_npz_dict() to restore. +# +# Parameters +# ---------- +# save_list : a list +# Parameters want to be saved. +# name : a string or None +# The name of the .npz file. +# sess : None or Session +# +# Notes +# ----- +# This function tries to avoid a potential broadcasting error raised by numpy. 
+# +# """ +# ## save params into a list +# save_list_var = [] +# if sess: +# save_list_var = sess.run(save_list) +# else: +# try: +# for k, value in enumerate(save_list): +# save_list_var.append(value.eval()) +# except: +# print(" Fail to save model, Hint: pass the session into this function, save_npz_dict(network.all_params, name='model.npz', sess=sess)") +# save_var_dict = {str(idx):val for idx, val in enumerate(save_list_var)} +# np.savez(name, **save_var_dict) +# save_list_var = None +# save_var_dict = None +# del save_list_var +# del save_var_dict +# print("[*] %s saved" % name) +# +# def load_npz_dict(path='', name='model.npz'): +# """Load the parameters of a Model saved by tl.files.save_npz_dict(). +# +# Parameters +# ---------- +# path : a string +# Folder path to .npz file. +# name : a string or None +# The name of the .npz file. +# +# Returns +# -------- +# params : list +# A list of parameters in order. +# """ +# d = np.load( path+name ) +# saved_list_var = [val[1] for val in sorted(d.items(), key=lambda tup: int(tup[0]))] +# return saved_list_var + + + +## Load and save network ckpt +def save_ckpt(sess=None, mode_name='model.ckpt', save_dir='checkpoint', var_list=[], global_step=None, printable=False): + """Save parameters into ckpt file. + + Parameters + ------------ + sess : Session. + mode_name : string, name of the model, default is ``model.ckpt``. + save_dir : string, path / file directory to the ckpt, default is ``checkpoint``. + var_list : list of variables, if not given, save all global variables. + global_step : int or None, step number. + printable : bool, if True, print all params info. + + Examples + --------- + - see ``tl.files.load_ckpt()``. + """ + assert sess is not None + ckpt_file = os.path.join(save_dir, mode_name) + if var_list == []: + var_list = tf.global_variables() + + print("[*] save %s n_params: %d" % (ckpt_file, len(var_list))) + + if printable: + for idx, v in enumerate(var_list): + print(" param {:3}: {:15} {}".format(idx, v.name, str(v.get_shape()))) + + saver = tf.train.Saver(var_list) + saver.save(sess, ckpt_file, global_step=global_step) + +def load_ckpt(sess=None, mode_name='model.ckpt', save_dir='checkpoint', var_list=[], is_latest=True, printable=False): + """Load parameters from ckpt file. + + Parameters + ------------ + sess : Session. + mode_name : string, name of the model, default is ``model.ckpt``. + Note that if ``is_latest`` is True, this function will get the ``mode_name`` automatically. + save_dir : string, path / file directory to the ckpt, default is ``checkpoint``. + var_list : list of variables, if not given, save all global variables. + is_latest : bool, if True, load the latest ckpt, if False, load the ckpt with the name of ```mode_name``. + printable : bool, if True, print all params info. + + Examples + ---------- + - Save all global parameters. + >>> tl.files.save_ckpt(sess=sess, mode_name='model.ckpt', save_dir='model', printable=True) + - Save specific parameters. + >>> tl.files.save_ckpt(sess=sess, mode_name='model.ckpt', var_list=net.all_params, save_dir='model', printable=True) + - Load latest ckpt. + >>> tl.files.load_ckpt(sess=sess, var_list=net.all_params, save_dir='model', printable=True) + - Load specific ckpt. 
+ >>> tl.files.load_ckpt(sess=sess, mode_name='model.ckpt', var_list=net.all_params, save_dir='model', is_latest=False, printable=True) + """ + assert sess is not None + + if is_latest: + ckpt_file = tf.train.latest_checkpoint(save_dir) + else: + ckpt_file = os.path.join(save_dir, mode_name) + + if var_list == []: + var_list = tf.global_variables() + + print("[*] load %s n_params: %d" % (ckpt_file, len(var_list))) + + if printable: + for idx, v in enumerate(var_list): + print(" param {:3}: {:15} {}".format(idx, v.name, str(v.get_shape()))) + + try: + saver = tf.train.Saver(var_list) + saver.restore(sess, ckpt_file) + except Exception as e: + print(e) + print("[*] load ckpt fail ...") + + + +## Load and save variables def save_any_to_npy(save_dict={}, name='file.npy'): """Save variables to .npy file. @@ -757,30 +1402,35 @@ def load_npy_to_any(path='', name='file.npy'): exit() -# Visualizing npz files -def npz_to_W_pdf(path=None, regx='w1pre_[0-9]+\.(npz)'): - """Convert the first weight matrix of .npz file to .pdf by using tl.visualize.W(). - Parameters - ---------- - path : a string or None - A folder path to npz files. - regx : a string - Regx for the file name. + +## Folder functions +def file_exists(filepath): + """ Check whether a file exists by given file path. """ + return os.path.isfile(filepath) + +def folder_exists(folderpath): + """ Check whether a folder exists by given folder path. """ + return os.path.isdir(folderpath) + +def del_file(filepath): + """ Delete a file by given file path. """ + os.remove(filepath) + +def del_folder(folderpath): + """ Delete a folder by given folder path. """ + os.rmdir(folderpath) + +def read_file(filepath): + """ Read a file and return a string. Examples - -------- - >>> Convert the first weight matrix of w1_pre...npz file to w1_pre...pdf. - >>> tl.files.npz_to_W_pdf(path='/Users/.../npz_file/', regx='w1pre_[0-9]+\.(npz)') + --------- + >>> data = tl.files.read_file('data.txt') """ - file_list = load_file_list(path=path, regx=regx) - for f in file_list: - W = load_npz(path, f)[0] - print("%s --> %s" % (f, f.split('.')[0]+'.pdf')) - visualize.W(W, second=10, saveable=True, name=f.split('.')[0], fig_idx=2012) + with open(filepath, 'r') as afile: + return afile.read() - -## Helper functions def load_file_list(path=None, regx='\.npz', printable=True): """Return a file list in a folder by given a path and regular expression. @@ -853,18 +1503,19 @@ def maybe_download_and_extract(filename, working_directory, url_source, extract= and optionally also tries to extract the file if format is ".zip" or ".tar" Parameters - ---------- + ----------- filename : string The name of the (to be) dowloaded file. 
    working_directory : string
        A folder path to search for the file in and download the file to
    url : string
        The URL to download the file from
    extract : bool, defaults to False
        If True, tries to uncompress the downloaded file if it is a ".tar.gz/.tar.bz2" or ".zip" file
    expected_bytes : int/None
        If set, tries to verify that the downloaded file is of the specified size, otherwise raises an Exception,
        defaults to None which corresponds to no check being performed
+
    Returns
    ----------
        filepath to downloaded (uncompressed) file
@@ -900,7 +1551,7 @@ def _dlProgress(count, blockSize, totalSize):
            _download(filename, working_directory, url_source)
            print()
            statinfo = os.stat(filepath)
-            print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.')
+            print('Successfully downloaded %s %s bytes.' % (filename, statinfo.st_size))
            if(not(expected_bytes is None) and (expected_bytes != statinfo.st_size)):
                raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
        if(extract):
@@ -916,3 +1567,48 @@ def _dlProgress(count, blockSize, totalSize):
            else:
                print("Unknown compression format: only .tar.gz/.tar.bz2/.tar and .zip are supported")
        return filepath
+
+
+## Sort
+def natural_keys(text):
+    """Sort a list of strings with numbers in human order.
+
+    Examples
+    ----------
+    >>> l = ['im1.jpg', 'im31.jpg', 'im11.jpg', 'im21.jpg', 'im03.jpg', 'im05.jpg']
+    >>> l.sort(key=tl.files.natural_keys)
+    ... ['im1.jpg', 'im03.jpg', 'im05.jpg', 'im11.jpg', 'im21.jpg', 'im31.jpg']
+    >>> l.sort() # that is what we don't want
+    ... ['im03.jpg', 'im05.jpg', 'im1.jpg', 'im11.jpg', 'im21.jpg', 'im31.jpg']
+
+    References
+    ----------
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    """
+    def atoi(text):
+        return int(text) if text.isdigit() else text
+    return [ atoi(c) for c in re.split('(\d+)', text) ]
+
+# Visualizing npz files
+def npz_to_W_pdf(path=None, regx='w1pre_[0-9]+\.(npz)'):
+    """Convert the first weight matrix of .npz file to .pdf by using tl.visualize.W().
+
+    Parameters
+    ----------
+    path : a string or None
+        A folder path to npz files.
+    regx : a string
+        Regx for the file name.
+
+    Examples
+    --------
+    - Convert the first weight matrix of w1_pre...npz file to w1_pre...pdf.
+    >>> tl.files.npz_to_W_pdf(path='/Users/.../npz_file/', regx='w1pre_[0-9]+\.(npz)')
+    """
+    file_list = load_file_list(path=path, regx=regx)
+    for f in file_list:
+        W = load_npz(path, f)[0]
+        print("%s --> %s" % (f, f.split('.')[0]+'.pdf'))
+        visualize.W(W, second=10, saveable=True, name=f.split('.')[0], fig_idx=2012)
diff --git a/tensorlayer/iterate.py b/tensorlayer/iterate.py
index 9778fc6f..d6c973dc 100644
--- a/tensorlayer/iterate.py
+++ b/tensorlayer/iterate.py
@@ -1,5 +1,5 @@
 #! /usr/bin/python
-# -*- coding: utf8 -*-
+# -*- coding: utf-8 -*-


@@ -21,11 +21,6 @@ def minibatches(inputs=None, targets=None, batch_size=None, shuffle=False):
    shuffle : boolean
        Indicating whether to use a shuffling queue, i.e. shuffle the dataset before returning.

-    Hints
-    -------
-    - If you have two inputs, e.g. X1 (1000, 100) and X2 (1000, 80), you can ``np.hstack((X1, X2))
-    into (1000, 180) and feed into ``inputs``, then you can split a batch of X1 and X2.
- Examples -------- >>> X = np.asarray([['a','a'], ['b','b'], ['c','c'], ['d','d'], ['e','e'], ['f','f']]) @@ -41,6 +36,12 @@ def minibatches(inputs=None, targets=None, batch_size=None, shuffle=False): ... (array([['e', 'e'], ... ['f', 'f']], ... dtype='>> network = tl.layers.DynamicRNNLayer(network, - ... cell_fn = tf.nn.rnn_cell.BasicLSTMCell, + ... cell_fn = tf.contrib.rnn.BasicLSTMCell, ... n_hidden = embedding_size, ... dropout = (0.7 if is_train else None), ... initializer = w_init, ... sequence_length = tl.layers.retrieve_seq_length_op2(input_seqs), ... return_last = True, - ... name = 'e_dynamicrnn',) + ... name = 'e_dynamicrnn') >>> return network >>> >>> net_train = embed_seq(t_caption, is_train=True, reuse=False) @@ -116,13 +115,15 @@ def set_name_reuse(enable=True): """ set_keep['name_reuse'] = enable -def initialize_rnn_state(state): - """Return the initialized RNN state. - The input is LSTMStateTuple or State of RNNCells. +def initialize_rnn_state(state, feed_dict=None): + """Returns the initialized RNN state. + The inputs are LSTMStateTuple or State of RNNCells and an optional feed_dict. Parameters ----------- state : a RNN state. + feed_dict : None or a dictionary for initializing the state values (optional). + If None, returns the zero state. """ try: # TF1.0 LSTMStateTuple = tf.contrib.rnn.LSTMStateTuple @@ -130,11 +131,11 @@ def initialize_rnn_state(state): LSTMStateTuple = tf.nn.rnn_cell.LSTMStateTuple if isinstance(state, LSTMStateTuple): - c = state.c.eval() - h = state.h.eval() + c = state.c.eval(feed_dict=feed_dict) + h = state.h.eval(feed_dict=feed_dict) return (c, h) else: - new_state = state.eval() + new_state = state.eval(feed_dict=feed_dict) return new_state def print_all_variables(train_only=False): @@ -151,7 +152,7 @@ def print_all_variables(train_only=False): t_vars = tf.trainable_variables() print(" [*] printing trainable variables") else: - try: # TF1.0 + try: # TF1.0+ t_vars = tf.global_variables() except: # TF0.12 t_vars = tf.all_variables() @@ -171,7 +172,7 @@ def get_variables_with_name(name, train_only=True, printable=False): if train_only: t_vars = tf.trainable_variables() else: - try: # TF1.0 + try: # TF1.0+ t_vars = tf.global_variables() except: # TF0.12 t_vars = tf.all_variables() @@ -199,7 +200,6 @@ def get_layers_with_name(network=None, name="", printable=False): if name in layer.name: layers.append(layer) if printable: - # print(layer.name) print(" got {:3}: {:15} {}".format(i, layer.name, str(layer.get_shape()))) i = i + 1 return layers @@ -222,19 +222,52 @@ def list_remove_repeat(l=None): [l2.append(i) for i in l if not i in l2] return l2 +def merge_networks(layers=[]): + """Merge all parameters, layers and dropout probabilities to a :class:`Layer`. + + Parameters + ---------- + layer : list of :class:`Layer` instance + Merge all parameters, layers and dropout probabilities to the first layer in the list. + + Examples + --------- + >>> n1 = ... + >>> n2 = ... 
+ >>> n1 = merge_networks([n1, n2]) + """ + layer = layers[0] + + all_params = [] + all_layers = [] + all_drop = {} + for l in layers: + all_params.extend(l.all_params) + all_layers.extend(l.all_layers) + all_drop.update(l.all_drop) + + layer.all_params = list(all_params) + layer.all_layers = list(all_layers) + layer.all_drop = dict(all_drop) + + layer.all_layers = list_remove_repeat(layer.all_layers) + layer.all_params = list_remove_repeat(layer.all_params) + + return layer + def initialize_global_variables(sess=None): - """Excute ``sess.run(tf.global_variables_initializer())`` for TF12+ or - sess.run(tf.initialize_all_variables()) for TF11. + """Excute ``sess.run(tf.global_variables_initializer())`` for TF 0.12+ or + ``sess.run(tf.initialize_all_variables())`` for TF 0.11. Parameters ---------- sess : a Session """ assert sess is not None - try: # TF12 - sess.run(tf.global_variables_initializer()) - except: # TF11 - sess.run(tf.initialize_all_variables()) + # try: # TF12+ + sess.run(tf.global_variables_initializer()) + # except: # TF11 + # sess.run(tf.initialize_all_variables()) ## Basic layer @@ -259,10 +292,10 @@ def __init__( name ='layer' ): self.inputs = inputs - scope_name=tf.get_variable_scope().name + scope_name = tf.get_variable_scope().name if scope_name: name = scope_name + '/' + name - if (name in set_keep['_layers_name_list']) and name_reuse == False: + if (name in set_keep['_layers_name_list']) and set_keep['name_reuse'] == False: raise Exception("Layer '%s' already exists, please choice other 'name' or reuse this layer\ \nHint : Use different name for different 'Layer' (The name is used to control parameter sharing)" % name) else: @@ -270,13 +303,13 @@ def __init__( if name not in ['', None, False]: set_keep['_layers_name_list'].append(name) - def print_params(self, details=True): + def print_params(self, details=True, session=None): ''' Print all info of parameters in the network''' for i, p in enumerate(self.all_params): if details: try: # print(" param {:3}: {:15} (mean: {:<18}, median: {:<18}, std: {:<18}) {}".format(i, str(p.eval().shape), p.eval().mean(), np.median(p.eval()), p.eval().std(), p.name)) - val = p.eval() + val = p.eval(session=session) print(" param {:3}: {:20} {:15} {} (mean: {:<18}, median: {:<18}, std: {:<18}) ".format(i, p.name, str(val.shape), p.dtype.name, val.mean(), np.median(val), val.std())) except Exception as e: print(str(e)) @@ -410,7 +443,7 @@ class Word2vecEmbeddingInputlayer(Layer): name : a string or None An optional name to attach to this layer. - Variables + Attributes -------------- nce_cost : a tensor The NCE loss. @@ -488,26 +521,29 @@ def __init__( embeddings = tf.get_variable(name='embeddings', shape=(vocabulary_size, embedding_size), initializer=E_init, + dtype=D_TYPE, **E_init_args) embed = tf.nn.embedding_lookup(embeddings, self.inputs) # Construct the variables for the NCE loss (i.e. negative sampling) nce_weights = tf.get_variable(name='nce_weights', shape=(vocabulary_size, embedding_size), initializer=nce_W_init, + dtype=D_TYPE, **nce_W_init_args) nce_biases = tf.get_variable(name='nce_biases', shape=(vocabulary_size), initializer=nce_b_init, + dtype=D_TYPE, **nce_b_init_args) # Compute the average NCE loss for the batch. # tf.nce_loss automatically draws a new sample of the negative labels # each time we evaluate the loss. 
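        # For example (illustrative numbers, not fixed by this layer): with
        # vocabulary_size=50000 and num_sampled=64, each training step scores the
        # true word against only 64 sampled negative classes instead of computing
        # a full 50000-way softmax, which is what makes NCE training cheap.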
self.nce_cost = tf.reduce_mean( - tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, - inputs=embed, labels=train_labels, - num_sampled=num_sampled, num_classes=vocabulary_size, - **nce_loss_args)) + tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, + inputs=embed, labels=train_labels, + num_sampled=num_sampled, num_classes=vocabulary_size, + **nce_loss_args)) self.outputs = embed self.normalized_embeddings = tf.nn.l2_normalize(embeddings, 1) @@ -543,7 +579,7 @@ class EmbeddingInputlayer(Layer): name : a string or None An optional name to attach to this layer. - Variables + Attributes ------------ outputs : a tensor The outputs of embedding layer. @@ -611,6 +647,7 @@ def __init__( embeddings = tf.get_variable(name='embeddings', shape=(vocabulary_size, embedding_size), initializer=E_init, + dtype=D_TYPE, **E_init_args) embed = tf.nn.embedding_lookup(embeddings, self.inputs) @@ -620,6 +657,88 @@ def __init__( self.all_params = [embeddings] self.all_drop = {} + +class AverageEmbeddingInputlayer(Layer): + """The :class:`AverageEmbeddingInputlayer` averages over embeddings of inputs, can be used as the input layer for models like DAN[1] and FastText[2]. + + Parameters + ------------ + inputs : input placeholder or tensor + vocabulary_size : an integer, the size of vocabulary + embedding_size : an integer, the dimension of embedding vectors + pad_value : an integer, the scalar pad value used in inputs + name : a string, the name of the layer + embeddings_initializer : the initializer of the embedding matrix + embeddings_kwargs : kwargs to get embedding matrix variable + + References + ------------ + - [1] Iyyer, M., Manjunatha, V., Boyd-Graber, J., & Daum’e III, H. (2015). Deep Unordered Composition Rivals Syntactic Methods for Text Classification. In Association for Computational Linguistics. + - [2] Joulin, A., Grave, E., Bojanowski, P., & Mikolov, T. (2016).`Bag of Tricks for Efficient Text Classification. 
`_ + """ + def __init__( + self, inputs, vocabulary_size, embedding_size, + pad_value=0, + name='average_embedding_layer', + embeddings_initializer=tf.random_uniform_initializer(-0.1, 0.1), + embeddings_kwargs=None, + ): + super().__init__(name=name) + + # if embeddings_kwargs is None: + # embeddings_kwargs = {} + + if inputs.get_shape().ndims != 2: + raise ValueError( + 'inputs must be of size batch_size * batch_sentence_length') + + self.inputs = inputs + + print(" [TL] AverageEmbeddingInputlayer %s: (%d, %d)" % (name, vocabulary_size, embedding_size)) + with tf.variable_scope(name): + self.embeddings = tf.get_variable( + name='embeddings', + shape=(vocabulary_size, embedding_size), + initializer=embeddings_initializer, + dtype=D_TYPE, + **(embeddings_kwargs or {}) + # **embeddings_kwargs + ) # **(embeddings_kwargs or {}), + + word_embeddings = tf.nn.embedding_lookup( + self.embeddings, self.inputs, + name='word_embeddings', + ) + # Zero out embeddings of pad value + masks = tf.not_equal(self.inputs, pad_value, name='masks') + word_embeddings *= tf.cast( + tf.expand_dims(masks, axis=-1), + # tf.float32, + dtype=D_TYPE, + ) + sum_word_embeddings = tf.reduce_sum(word_embeddings, axis=1) + + # Count number of non-padding words in each sentence + sentence_lengths = tf.count_nonzero( + masks, + axis=1, + keep_dims=True, + # dtype=tf.float32, + dtype=D_TYPE, + name='sentence_lengths', + ) + + sentence_embeddings = tf.divide( + sum_word_embeddings, + sentence_lengths + 1e-8, # Add epsilon to avoid dividing by 0 + name='sentence_embeddings' + ) + + self.outputs = sentence_embeddings + self.all_layers = [self.outputs] + self.all_params = [self.embeddings] + self.all_drop = {} + ## Dense layer class DenseLayer(Layer): """ @@ -686,12 +805,12 @@ def __init__( self.n_units = n_units print(" [TL] DenseLayer %s: %d %s" % (self.name, self.n_units, act.__name__)) with tf.variable_scope(name) as vs: - W = tf.get_variable(name='W', shape=(n_in, n_units), initializer=W_init, **W_init_args ) + W = tf.get_variable(name='W', shape=(n_in, n_units), initializer=W_init, dtype=D_TYPE, **W_init_args ) if b_init is not None: try: - b = tf.get_variable(name='b', shape=(n_units), initializer=b_init, **b_init_args ) + b = tf.get_variable(name='b', shape=(n_units), initializer=b_init, dtype=D_TYPE, **b_init_args ) except: # If initializer is a constant, do not specify shape. - b = tf.get_variable(name='b', initializer=b_init, **b_init_args ) + b = tf.get_variable(name='b', initializer=b_init, dtype=D_TYPE, **b_init_args ) self.outputs = act(tf.matmul(self.inputs, W) + b) else: self.outputs = act(tf.matmul(self.inputs, W)) @@ -778,7 +897,7 @@ def __init__( print(" lambda_l2_w: %f" % lambda_l2_w) print(" learning_rate: %f" % learning_rate) - # Mean-squre-error i.e. quadratic-cost + # Mean-square-error i.e. 
quadratic-cost mse = tf.reduce_sum(tf.squared_difference(y, x_recon), 1) mse = tf.reduce_mean(mse) # in theano: mse = ((y - x) ** 2 ).sum(axis=1).mean() # mse = tf.reduce_mean(tf.reduce_sum(tf.square(tf.sub(y, x_recon)), 1)) @@ -1099,8 +1218,8 @@ def __init__( print(" [TL] DropconnectDenseLayer %s: %d %s" % (self.name, self.n_units, act.__name__)) with tf.variable_scope(name) as vs: - W = tf.get_variable(name='W', shape=(n_in, n_units), initializer=W_init, **W_init_args ) - b = tf.get_variable(name='b', shape=(n_units), initializer=b_init, **b_init_args ) + W = tf.get_variable(name='W', shape=(n_in, n_units), initializer=W_init, dtype=D_TYPE, **W_init_args ) + b = tf.get_variable(name='b', shape=(n_units), initializer=b_init, dtype=D_TYPE, **b_init_args ) self.outputs = act(tf.matmul(self.inputs, W) + b)#, name=name) # 1.2 set_keep[name] = tf.placeholder(tf.float32) @@ -1169,7 +1288,7 @@ def __init__( if act is None: act = tf.identity with tf.variable_scope(name) as vs: - W = tf.get_variable(name='W_conv1d', shape=shape, initializer=W_init, **W_init_args ) + W = tf.get_variable(name='W_conv1d', shape=shape, initializer=W_init, dtype=D_TYPE, **W_init_args ) self.outputs = tf.nn.convolution( self.inputs, W, @@ -1179,7 +1298,7 @@ def __init__( data_format=data_format ) #1.2 if b_init: - b = tf.get_variable(name='b_conv1d', shape=(shape[-1]), initializer=b_init, **b_init_args ) + b = tf.get_variable(name='b_conv1d', shape=(shape[-1]), initializer=b_init, dtype=D_TYPE, **b_init_args ) self.outputs = self.outputs + b self.outputs = act(self.outputs) @@ -1218,8 +1337,8 @@ class Conv2dLayer(Layer): The arguments for the weights tf.get_variable(). b_init_args : dictionary The arguments for the biases tf.get_variable(). - use_cudnn_on_gpu : an optional string from: "NHWC", "NCHW". Defaults to "NHWC". - data_format : an optional bool. Defaults to True. + use_cudnn_on_gpu : bool, default is None. + data_format : string "NHWC" or "NCHW", default is "NHWC" name : a string or None An optional name to attach to this layer. 
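A minimal usage sketch of the ``Conv2dLayer`` API documented above; the input shape and names here are illustrative assumptions, not part of this diff:

    import tensorflow as tf
    import tensorlayer as tl

    x = tf.placeholder(tf.float32, [None, 28, 28, 1])   # NHWC input, unknown batch size
    net = tl.layers.InputLayer(x, name='input')
    net = tl.layers.Conv2dLayer(net,
                    act = tf.nn.relu,
                    shape = [5, 5, 1, 32],   # [filter_height, filter_width, in_channels, out_channels]
                    strides = [1, 1, 1, 1],
                    padding = 'SAME',
                    name = 'cnn_layer1')     # outputs (?, 28, 28, 32)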
@@ -1277,9 +1396,9 @@ def __init__( (self.name, str(shape), str(strides), padding, act.__name__)) with tf.variable_scope(name) as vs: - W = tf.get_variable(name='W_conv2d', shape=shape, initializer=W_init, **W_init_args ) + W = tf.get_variable(name='W_conv2d', shape=shape, initializer=W_init, dtype=D_TYPE, **W_init_args ) if b_init: - b = tf.get_variable(name='b_conv2d', shape=(shape[-1]), initializer=b_init, **b_init_args ) + b = tf.get_variable(name='b_conv2d', shape=(shape[-1]), initializer=b_init, dtype=D_TYPE, **b_init_args ) self.outputs = act( tf.nn.conv2d(self.inputs, W, strides=strides, padding=padding, use_cudnn_on_gpu=use_cudnn_on_gpu, data_format=data_format) + b ) else: self.outputs = act( tf.nn.conv2d(self.inputs, W, strides=strides, padding=padding, use_cudnn_on_gpu=use_cudnn_on_gpu, data_format=data_format)) @@ -1383,9 +1502,9 @@ def __init__( (self.name, str(shape), str(output_shape), str(strides), padding, act.__name__)) # print(" DeConv2dLayer: Untested") with tf.variable_scope(name) as vs: - W = tf.get_variable(name='W_deconv2d', shape=shape, initializer=W_init, **W_init_args ) + W = tf.get_variable(name='W_deconv2d', shape=shape, initializer=W_init, dtype=D_TYPE, **W_init_args ) if b_init: - b = tf.get_variable(name='b_deconv2d', shape=(shape[-2]), initializer=b_init, **b_init_args ) + b = tf.get_variable(name='b_deconv2d', shape=(shape[-2]), initializer=b_init, dtype=D_TYPE, **b_init_args ) self.outputs = act( tf.nn.conv2d_transpose(self.inputs, W, output_shape=output_shape, strides=strides, padding=padding) + b ) else: self.outputs = act( tf.nn.conv2d_transpose(self.inputs, W, output_shape=output_shape, strides=strides, padding=padding)) @@ -1446,8 +1565,8 @@ def __init__( with tf.variable_scope(name) as vs: # W = tf.Variable(W_init(shape=shape, **W_init_args), name='W_conv') # b = tf.Variable(b_init(shape=[shape[-1]], **b_init_args), name='b_conv') - W = tf.get_variable(name='W_conv3d', shape=shape, initializer=W_init, **W_init_args ) - b = tf.get_variable(name='b_conv3d', shape=(shape[-1]), initializer=b_init, **b_init_args ) + W = tf.get_variable(name='W_conv3d', shape=shape, initializer=W_init, dtype=D_TYPE, **W_init_args ) + b = tf.get_variable(name='b_conv3d', shape=(shape[-1]), initializer=b_init, dtype=D_TYPE, **b_init_args ) self.outputs = act( tf.nn.conv3d(self.inputs, W, strides=strides, padding=padding, name=None) + b ) # self.outputs = act( tf.nn.conv3d(self.inputs, W, strides=strides, padding=padding, name=None) + b ) @@ -1506,8 +1625,8 @@ def __init__( (self.name, str(shape), str(output_shape), str(strides), padding, act.__name__)) with tf.variable_scope(name) as vs: - W = tf.get_variable(name='W_deconv3d', shape=shape, initializer=W_init, **W_init_args ) - b = tf.get_variable(name='b_deconv3d', shape=(shape[-2]), initializer=b_init, **b_init_args ) + W = tf.get_variable(name='W_deconv3d', shape=shape, initializer=W_init, dtype=D_TYPE, **W_init_args ) + b = tf.get_variable(name='b_deconv3d', shape=(shape[-2]), initializer=b_init, dtype=D_TYPE, **b_init_args ) self.outputs = act( tf.nn.conv3d_transpose(self.inputs, W, output_shape=output_shape, strides=strides, padding=padding) + b ) @@ -1626,6 +1745,237 @@ def __init__( self.all_layers.extend( [self.outputs] ) +# ## 2D deformable convolutional layer +def _to_bc_h_w(x, x_shape): + """(b, h, w, c) -> (b*c, h, w)""" + x = tf.transpose(x, [0, 3, 1, 2]) + x = tf.reshape(x, (-1, x_shape[1], x_shape[2])) + return x + +def _to_b_h_w_n_c(x, x_shape): + """(b*c, h, w, n) -> (b, h, w, n, c)""" + x = tf.reshape( + 
x, (-1, x_shape[4], x_shape[1], x_shape[2], x_shape[3])) + x = tf.transpose(x, [0, 2, 3, 4, 1]) + return x + +def tf_repeat(a, repeats): + """TensorFlow version of np.repeat for 1D""" + # https://github.com/tensorflow/tensorflow/issues/8521 + assert len(a.get_shape()) == 1 + + a = tf.expand_dims(a, -1) + a = tf.tile(a, [1, repeats]) + a = tf_flatten(a) + return a + +def tf_batch_map_coordinates(inputs, coords): + """Batch version of tf_map_coordinates + + Only supports 2D feature maps + + Parameters + ---------- + input : tf.Tensor. shape = (b*c, h, w) + coords : tf.Tensor. shape = (b*c, h, w, n, 2) + + Returns + ------- + tf.Tensor. shape = (b*c, h, w, n) + """ + + input_shape = inputs.get_shape() + coords_shape = coords.get_shape() + batch_channel = tf.shape(inputs)[0] + input_h = int(input_shape[1]) + input_w = int(input_shape[2]) + kernel_n = int(coords_shape[3]) + n_coords = input_h * input_w * kernel_n + + coords_lt = tf.cast(tf.floor(coords), 'int32') + coords_rb = tf.cast(tf.ceil(coords), 'int32') + coords_lb = tf.stack([coords_lt[:, :, :, :, 0], coords_rb[:, :, :, :, 1]], axis=-1) + coords_rt = tf.stack([coords_rb[:, :, :, :, 0], coords_lt[:, :, :, :, 1]], axis=-1) + + idx = tf_repeat(tf.range(batch_channel), n_coords) + + vals_lt = _get_vals_by_coords(inputs, coords_lt, idx, (batch_channel, input_h, input_w, kernel_n)) + vals_rb = _get_vals_by_coords(inputs, coords_rb, idx, (batch_channel, input_h, input_w, kernel_n)) + vals_lb = _get_vals_by_coords(inputs, coords_lb, idx, (batch_channel, input_h, input_w, kernel_n)) + vals_rt = _get_vals_by_coords(inputs, coords_rt, idx, (batch_channel, input_h, input_w, kernel_n)) + + coords_offset_lt = coords - tf.cast(coords_lt, 'float32') + + vals_t = vals_lt + (vals_rt - vals_lt) * coords_offset_lt[:, :, :, :, 0] + vals_b = vals_lb + (vals_rb - vals_lb) * coords_offset_lt[:, :, :, :, 0] + mapped_vals = vals_t + (vals_b - vals_t) * coords_offset_lt[:, :, :, :, 1] + + return mapped_vals + +def tf_batch_map_offsets(inputs, offsets, grid_offset): + """Batch map offsets into input + + Parameters + --------- + inputs : tf.Tensor. shape = (b, h, w, c) + offsets: tf.Tensor. shape = (b, h, w, 2*n) + grid_offset: Offset grids shape = (h, w, n, 2) + + Returns + ------- + tf.Tensor. 
shape = (b, h, w, c) + """ + + input_shape = inputs.get_shape() + batch_size = tf.shape(inputs)[0] + kernel_n = int(int(offsets.get_shape()[3])/2) + input_h = input_shape[1] + input_w = input_shape[2] + channel = input_shape[3] + + # inputs (b, h, w, c) --> (b*c, h, w) + inputs = _to_bc_h_w(inputs, input_shape) + + # offsets (b, h, w, 2*n) --> (b, h, w, n, 2) + offsets = tf.reshape(offsets, (batch_size, input_h, input_w, kernel_n, 2)) + # offsets (b, h, w, n, 2) --> (b*c, h, w, n, 2) + # offsets = tf.tile(offsets, [channel, 1, 1, 1, 1]) + + coords = tf.expand_dims(grid_offset, 0) # grid_offset --> (1, h, w, n, 2) + coords = tf.tile(coords, [batch_size, 1, 1, 1, 1]) + offsets # grid_offset --> (b, h, w, n, 2) + + # clip out of bound + coords = tf.stack([tf.clip_by_value(coords[:, :, :, :, 0], 0.0, tf.cast(input_h - 1, 'float32')), + tf.clip_by_value(coords[:, :, :, :, 1], 0.0, tf.cast(input_w - 1, 'float32'))], axis=-1) + coords = tf.tile(coords, [channel, 1, 1, 1, 1]) + + mapped_vals = tf_batch_map_coordinates(inputs, coords) + # (b*c, h, w, n) --> (b, h, w, n, c) + mapped_vals = _to_b_h_w_n_c(mapped_vals, [batch_size, input_h, input_w, kernel_n, channel]) + + return mapped_vals + +class DeformableConv2dLayer(Layer): + """The :class:`DeformableConv2dLayer` class is a + `Deformable Convolutional Networks `_ . + + Parameters + ----------- + layer : TensorLayer layer. + offset_layer : TensorLayer layer, to predict the offset of convolutional operations. The shape of its output should be (batchsize, input height, input width, 2*(number of element in the convolutional kernel)) + e.g. if apply a 3*3 kernel, the number of the last dimension should be 18 (2*3*3) + channel_multiplier : int, The number of channels to expand to. + filter_size : tuple (height, width) for filter size. + strides : tuple (height, width) for strides. Current implementation fix to (1, 1, 1, 1) + act : None or activation function. + shape : list of shape + shape of the filters, [filter_height, filter_width, in_channels, out_channels]. + W_init : weights initializer + The initializer for initializing the weight matrix. + b_init : biases initializer or None + The initializer for initializing the bias vector. If None, skip biases. + W_init_args : dictionary + The arguments for the weights tf.get_variable(). + b_init_args : dictionary + The arguments for the biases tf.get_variable(). + name : a string or None + An optional name to attach to this layer. + + Examples + -------- + >>> network = tl.layers.InputLayer(x, name='input_layer') + >>> offset_1 = tl.layers.Conv2dLayer(layer=network, act=act, shape=[3, 3, 3, 18], strides=[1, 1, 1, 1],padding='SAME', name='offset_layer1') + >>> network = tl.layers.DeformableConv2dLayer(layer=network, act=act, offset_layer=offset_1, shape=[3, 3, 3, 32], name='deformable_conv_2d_layer1') + >>> offset_2 = tl.layers.Conv2dLayer(layer=network, act=act, shape=[3, 3, 32, 18], strides=[1, 1, 1, 1], padding='SAME', name='offset_layer2') + >>> network = tl.layers.DeformableConv2dLayer(layer=network, act = act, offset_layer=offset_2, shape=[3, 3, 32, 64], name='deformable_conv_2d_layer2') + + References + ----------- + - The deformation operation was adapted from the implementation in ``_ + + Notes + ----------- + - The stride is fixed as (1, 1, 1, 1). + - The padding is fixed as 'SAME'. + - The current implementation is memory-inefficient, please use carefully. 
+    """
+    def __init__(
+        self,
+        layer=None,
+        act=tf.identity,
+        offset_layer=None,
+        shape=[3, 3, 1, 100],
+        name='deformable_conv_2d_layer',
+        W_init=tf.truncated_normal_initializer(stddev=0.02),
+        b_init=tf.constant_initializer(value=0.0),
+        W_init_args={},
+        b_init_args={}
+    ):
+        if tf.__version__ < "1.4":
+            raise Exception("Deformable CNN layer requires TensorFlow 1.4 or a higher version")
+
+        Layer.__init__(self, name=name)
+        self.inputs = layer.outputs
+        self.offset_layer = offset_layer
+
+        print("  [TL] DeformableConv2dLayer %s: shape:%s, act:%s" %
+              (self.name, str(shape), act.__name__))
+
+        with tf.variable_scope(name) as vs:
+
+            offset = self.offset_layer.outputs
+            assert offset.get_shape()[-1] == 2 * shape[0] * shape[1]
+
+            ## Grid initialisation
+            input_h = int(self.inputs.get_shape()[1])
+            input_w = int(self.inputs.get_shape()[2])
+            kernel_n = shape[0] * shape[1]
+            initial_offsets = tf.stack(tf.meshgrid(tf.range(shape[0]),
+                                       tf.range(shape[1]),
+                                       indexing='ij')) # initial_offsets --> (kh, kw, 2)
+            initial_offsets = tf.reshape(initial_offsets, (-1, 2)) # initial_offsets --> (n, 2)
+            initial_offsets = tf.expand_dims(initial_offsets, 0) # initial_offsets --> (1, n, 2)
+            initial_offsets = tf.expand_dims(initial_offsets, 0) # initial_offsets --> (1, 1, n, 2)
+            initial_offsets = tf.tile(initial_offsets, [input_h, input_w, 1, 1]) # initial_offsets --> (h, w, n, 2)
+            initial_offsets = tf.cast(initial_offsets, 'float32')
+            grid = tf.meshgrid(
+                tf.range(- int((shape[0] - 1)/2.0), int(input_h - int((shape[0] - 1)/2.0)), 1),
+                tf.range(- int((shape[1] - 1)/2.0), int(input_w - int((shape[1] - 1)/2.0)), 1), indexing='ij')
+
+            grid = tf.stack(grid, axis=-1)
+            grid = tf.cast(grid, 'float32') # grid --> (h, w, 2)
+            grid = tf.expand_dims(grid, 2) # grid --> (h, w, 1, 2)
+            grid = tf.tile(grid, [1, 1, kernel_n, 1]) # grid --> (h, w, n, 2)
+            grid_offset = grid + initial_offsets # grid_offset --> (h, w, n, 2)
+
+            input_deform = tf_batch_map_offsets(self.inputs, offset, grid_offset)
+
+            W = tf.get_variable(name='W_conv2d', shape=[1, 1, shape[0] * shape[1], shape[-2], shape[-1]],
+                                initializer=W_init, dtype=D_TYPE, **W_init_args)
+            b = tf.get_variable(name='b_conv2d', shape=(shape[-1]),
+                                initializer=b_init, dtype=D_TYPE, **b_init_args)
+
+            self.outputs = tf.reshape(act(
+                tf.nn.conv3d(input_deform, W, strides=[1, 1, 1, 1, 1], padding='VALID', name=None) + b),
+                (tf.shape(self.inputs)[0], input_h, input_w, shape[-1]))
+
+        ## fixed
+        self.all_layers = list(layer.all_layers)
+        self.all_params = list(layer.all_params)
+        self.all_drop = dict(layer.all_drop)
+
+        ## offset_layer
+        offset_params = [osparam for osparam in offset_layer.all_params if osparam not in layer.all_params]
+        offset_layers = [oslayer for oslayer in offset_layer.all_layers if oslayer not in layer.all_layers]
+
+        self.all_params.extend(offset_params)
+        self.all_layers.extend(offset_layers)
+        self.all_drop.update(offset_layer.all_drop)
+
+        ## this layer
+        self.all_layers.extend([self.outputs])
+        self.all_params.extend([W, b])
+
def AtrousConv1dLayer(net, n_filter=32, filter_size=2, stride=1, dilation=1, act=None,
        padding='SAME', use_cudnn_on_gpu=None, data_format='NWC',
        W_init = tf.truncated_normal_initializer(stddev=0.02),
@@ -1661,7 +2011,6 @@ def AtrousConv1dLayer(net, n_filter=32, filter_size=2, stride=1, dilation=1, act
        )
    return net

-
class AtrousConv2dLayer(Layer):
    """The :class:`AtrousConv2dLayer` class is Atrous convolution (a.k.a. convolution with holes or dilated convolution) 2D layer, see `tf.nn.atrous_conv2d `_.
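A rough sketch of the dilated convolution that ``AtrousConv2dLayer`` wraps; the shapes here are illustrative assumptions:

    # tf.nn.atrous_conv2d inserts (rate - 1) zeros between filter taps, enlarging
    # the receptive field without adding parameters; rate=1 is an ordinary conv2d.
    import tensorflow as tf
    x = tf.placeholder(tf.float32, [None, 64, 64, 3])
    filters = tf.get_variable('filter', shape=[3, 3, 3, 16])     # [h, w, in_channels, out_channels]
    y = tf.nn.atrous_conv2d(x, filters, rate=2, padding='SAME')  # effective kernel is 5x5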
@@ -1696,15 +2045,15 @@ def __init__(
    ):
        Layer.__init__(self, name=name)
        self.inputs = layer.outputs
-        print("  [TL] AtrousConv2dLayer %s: n_filter:%d filter_size:%s rate:%d pad:%s act:%s" %
-            (self.name, n_filter, filter_size, rate, padding, act.__name__))
        if act is None:
            act = tf.identity
+        print("  [TL] AtrousConv2dLayer %s: n_filter:%d filter_size:%s rate:%d pad:%s act:%s" %
+            (self.name, n_filter, filter_size, rate, padding, act.__name__))
        with tf.variable_scope(name) as vs:
            shape = [filter_size[0], filter_size[1], int(self.inputs.get_shape()[-1]), n_filter]
-            filters = tf.get_variable(name='filter', shape=shape, initializer=W_init, **W_init_args )
+            filters = tf.get_variable(name='filter', shape=shape, initializer=W_init, dtype=D_TYPE, **W_init_args )
            if b_init:
-                b = tf.get_variable(name='b', shape=(n_filter), initializer=b_init, **b_init_args )
+                b = tf.get_variable(name='b', shape=(n_filter), initializer=b_init, dtype=D_TYPE, **b_init_args )
                self.outputs = act(tf.nn.atrous_conv2d(self.inputs, filters, rate, padding) + b)
            else:
                self.outputs = act(tf.nn.atrous_conv2d(self.inputs, filters, rate, padding))
@@ -1792,6 +2141,7 @@ def __init__(
        self.all_layers.extend( [self.outputs] )
        self.all_params.extend( variables )

+
## Initializers for Convolutional Layers
def deconv2d_bilinear_upsampling_initializer(shape):
    """Returns initializer that can be passed to DeConv2dLayer to initialize the
@@ -1851,7 +2201,7 @@ def deconv2d_bilinear_upsampling_initializer(shape):
        weights[:, :, i, i] = bilinear_kernel

    #assign numpy array to constant_initializer and pass to get_variable
-    bilinear_weights_init = tf.constant_initializer(value=weights, dtype=tf.float32)
+    bilinear_weights_init = tf.constant_initializer(value=weights, dtype=D_TYPE) #dtype=tf.float32)
    return bilinear_weights_init

## Convolutional layer (Simplified)
@@ -1871,6 +2221,23 @@ def Conv1d(net, n_filter=32, filter_size=5, stride=1, dilation_rate=1, act=None,
    dilation_rate : an int.
    data_format : As it is 1D conv, the default is "NWC".
    act : None or activation function.
    others : see :class:`Conv1dLayer`.
+
+    Examples
+    ---------
+    >>> x = tf.placeholder(tf.float32, [batch_size, width])
+    >>> y_ = tf.placeholder(tf.int64, shape=[batch_size,])
+    >>> n = InputLayer(x, name='in')
+    >>> n = ReshapeLayer(n, [-1, width, 1], name='rs')
+    >>> n = Conv1d(n, 64, 3, 1, act=tf.nn.relu, name='c1')
+    >>> n = MaxPool1d(n, 2, 2, padding='valid', name='m1')
+    >>> n = Conv1d(n, 128, 3, 1, act=tf.nn.relu, name='c2')
+    >>> n = MaxPool1d(n, 2, 2, padding='valid', name='m2')
+    >>> n = Conv1d(n, 128, 3, 1, act=tf.nn.relu, name='c3')
+    >>> n = MaxPool1d(n, 2, 2, padding='valid', name='m3')
+    >>> n = FlattenLayer(n, name='f')
+    >>> n = DenseLayer(n, 500, tf.nn.relu, name='d1')
+    >>> n = DenseLayer(n, 100, tf.nn.relu, name='d2')
+    >>> n = DenseLayer(n, 2, tf.identity, name='o')
    """
    if act is None:
        act = tf.identity
@@ -1959,14 +2326,14 @@ def DeConv2d(net, n_out_channel = 32, filter_size=(3, 3),
    assert len(strides) == 2, "len(strides) should be 2, DeConv2d and DeConv2dLayer are different."
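    # DeConv2dLayer below needs a fully specified output_shape, so the batch
    # dimension is resolved first: statically from the graph when it is known,
    # otherwise dynamically via tf.shape(...) at run time.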
    if act is None:
        act = tf.identity
-    # if batch_size is None:
+    if batch_size is None:
#             batch_size = tf.shape(net.outputs)[0]
-    fixed_batch_size = net.outputs.get_shape().with_rank_at_least(1)[0]
-    if fixed_batch_size.value:
-        batch_size = fixed_batch_size.value
-    else:
-        from tensorflow.python.ops import array_ops
-        batch_size = array_ops.shape(net.outputs)[0]
+        fixed_batch_size = net.outputs.get_shape().with_rank_at_least(1)[0]
+        if fixed_batch_size.value:
+            batch_size = fixed_batch_size.value
+        else:
+            from tensorflow.python.ops import array_ops
+            batch_size = array_ops.shape(net.outputs)[0]
    net = DeConv2dLayer(layer = net,
                    act = act,
                    shape = [filter_size[0], filter_size[1], n_out_channel, int(net.outputs.get_shape()[-1])],
@@ -2112,10 +2479,108 @@ def MeanPool3d(net, filter_size, strides, padding='valid', data_format='channels
    net_new.all_layers.extend( [outputs] )
    return net_new

+class DepthwiseConv2d(Layer):
+    """Separable/Depthwise Convolutional 2D, see `tf.nn.depthwise_conv2d `_.
+
+    Input:
+        4-D Tensor [batch, height, width, in_channels].
+    Output:
+        4-D Tensor [batch, new height, new width, in_channels * channel_multiplier].
+
+    Parameters
+    ------------
+    layer : TensorLayer layer.
+    channel_multiplier : int, The number of channels to expand to.
+    shape : tuple (height, width) for filter size.
+    strides : tuple (height, width) for strides.
+    act : None or activation function.
+    padding : a string from: "SAME", "VALID".
+        The type of padding algorithm to use.
+    W_init : weights initializer
+        The initializer for initializing the weight matrix.
+    b_init : biases initializer or None
+        The initializer for initializing the bias vector. If None, skip biases.
+    W_init_args : dictionary
+        The arguments for the weights tf.get_variable().
+    b_init_args : dictionary
+        The arguments for the biases tf.get_variable().
+    name : a string or None
+        An optional name to attach to this layer.
+
+    Examples
+    ---------
+    >>> t_im = tf.placeholder("float32", [None, 256, 256, 3])
+    >>> net = InputLayer(t_im, name='in')
+    >>> net = DepthwiseConv2d(net, 32, (3, 3), (1, 1, 1, 1), tf.nn.relu, padding="SAME", name='dep')
+    >>> print(net.outputs.get_shape())
+    ... (?, 256, 256, 96)
+
+    References
+    -----------
+    - tflearn's `grouped_conv_2d `_
+    - keras's `separableconv2d `_
+    """
+    def __init__(
+        self,
+        layer = None,
+        # n_filter = 32,
+        channel_multiplier = 3,
+        shape = (3, 3),
+        strides = (1, 1),
+        act = None,
+        padding='SAME',
+        W_init = tf.truncated_normal_initializer(stddev=0.02),
+        b_init = tf.constant_initializer(value=0.0),
+        W_init_args = {},
+        b_init_args = {},
+        name ='depthwise_conv2d',
+    ):
+        Layer.__init__(self, name=name)
+        self.inputs = layer.outputs
+
+        if act is None:
+            act = tf.identity
+
+        print("  [TL] DepthwiseConv2d %s: shape:%s strides:%s pad:%s act:%s" %
+            (self.name, str(shape), str(strides), padding, act.__name__))
+
+        try:
+            pre_channel = int(layer.outputs.get_shape()[-1])
+        except: # if pre_channel is ?, it happens when using Spatial Transformer Net
+            pre_channel = 1
+            print("[warning] unknown input channels, set to 1")
+
+        shape = [shape[0], shape[1], pre_channel, channel_multiplier]
+
+        if len(strides) == 2:
+            strides = [1, strides[0], strides[1], 1]
+
+        assert len(strides) == 4, "len(strides) should be 4."
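+        # Each of the `pre_channel` input channels is convolved with its own
+        # `channel_multiplier` filters, so W is [h, w, pre_channel, channel_multiplier]
+        # and the output carries pre_channel * channel_multiplier channels.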
+ + with tf.variable_scope(name) as vs: + W = tf.get_variable(name='W_sepconv2d', shape=shape, initializer=W_init, dtype=D_TYPE, **W_init_args ) # [filter_height, filter_width, in_channels, channel_multiplier] + if b_init: + b = tf.get_variable(name='b_sepconv2d', shape=(pre_channel*channel_multiplier), initializer=b_init, dtype=D_TYPE, **b_init_args ) + self.outputs = act( tf.nn.depthwise_conv2d(self.inputs, W, strides=strides, padding=padding) + b ) + else: + self.outputs = act( tf.nn.depthwise_conv2d(self.inputs, W, strides=strides, padding=padding) ) + + self.all_layers = list(layer.all_layers) + self.all_params = list(layer.all_params) + self.all_drop = dict(layer.all_drop) + self.all_layers.extend( [self.outputs] ) + if b_init: + self.all_params.extend( [W, b] ) + else: + self.all_params.extend( [W] ) + ## Super resolution def SubpixelConv2d(net, scale=2, n_out_channel=None, act=tf.identity, name='subpixel_conv2d'): - """The :class:`SubpixelConv2d` class is a sub-pixel 2d convolutional ayer, usually be used - for Super-Resolution applications, `example code `_. + """It is a sub-pixel 2d upsampling layer, usually be used + for Super-Resolution applications, see `example code `_. Parameters ------------ @@ -2165,16 +2630,21 @@ def SubpixelConv2d(net, scale=2, n_out_channel=None, act=tf.identity, name='subp scope_name = tf.get_variable_scope().name if scope_name: - name = scope_name + '/' + name + whole_name = scope_name + '/' + name + else: + whole_name = name def _PS(X, r, n_out_channel): if n_out_channel >= 1: assert int(X.get_shape()[-1]) == (r ** 2) * n_out_channel, _err_log + ''' bsize, a, b, c = X.get_shape().as_list() bsize = tf.shape(X)[0] # Handling Dimension(None) type for undefined batch dim Xs=tf.split(X,r,3) #b*h*w*r*r Xr=tf.concat(Xs,2) #b*h*(r*w)*r X=tf.reshape(Xr,(bsize,r*a,r*b,n_out_channel)) # b*(r*h)*(r*w)*c + ''' + X=tf.depth_to_space(X,r) else: print(_err_log) return X @@ -2187,7 +2657,7 @@ def _PS(X, r, n_out_channel): print(" [TL] SubpixelConv2d %s: scale: %d n_out_channel: %s act: %s" % (name, scale, n_out_channel, act.__name__)) - net_new = Layer(inputs, name=name) + net_new = Layer(inputs, name=whole_name) # with tf.name_scope(name): with tf.variable_scope(name) as vs: net_new.outputs = act(_PS(inputs, r=scale, n_out_channel=n_out_channel)) @@ -2199,7 +2669,7 @@ def _PS(X, r, n_out_channel): return net_new def SubpixelConv2d_old(net, scale=2, n_out_channel=None, act=tf.identity, name='subpixel_conv2d'): - """The :class:`SubpixelConv2d` class is a sub-pixel 2d convolutional ayer, usually be used + """It is a sub-pixel 2d upsampling layer, usually be used for Super-Resolution applications, `example code `_. 
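The rewritten _PS above replaces the manual split/concat/reshape with a single built-in op. A small hedged check (illustrative shapes only) that tf.depth_to_space performs the r=2 pixel shuffle:

# Hedged numeric check of the pixel shuffle used by SubpixelConv2d (TF 1.x).
import numpy as np
import tensorflow as tf

x = tf.placeholder(tf.float32, [1, 2, 2, 4])   # 4 channels = r**2 * n_out_channel with r=2
y = tf.depth_to_space(x, 2)                    # -> (1, 4, 4, 1)
with tf.Session() as sess:
    out = sess.run(y, feed_dict={x: np.arange(16, dtype=np.float32).reshape(1, 2, 2, 4)})
    print(out.shape)  # (1, 4, 4, 1)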
Parameters @@ -2252,31 +2722,12 @@ def SubpixelConv2d_old(net, scale=2, n_out_channel=None, act=tf.identity, name=' if scope_name: name = scope_name + '/' + name - def _phase_shift(I, r): - if tf.__version__ < '1.0': - raise Exception("Only support TF1.0+") - bsize, a, b, c = I.get_shape().as_list() - bsize = tf.shape(I)[0] # Handling Dimension(None) type for undefined batch dim - X = tf.reshape(I, (bsize, a, b, r, r)) - X = tf.transpose(X, (0, 1, 2, 4, 3)) # bsize, a, b, 1, 1 # tf 0.12 - # X = tf.split(1, a, X) # a, [bsize, b, r, r] # tf 0.12 - X = tf.split(X, a, 1) - # X = tf.concat(2, [tf.squeeze(x, axis=1) for x in X]) # bsize, b, a*r, r # tf 0.12 - X = tf.concat([tf.squeeze(x, axis=1) for x in X], 2) - # X = tf.split(1, b, X) # b, [bsize, a*r, r] # tf 0.12 - X = tf.split(X, b, 1) - # X = tf.concat(2, [tf.squeeze(x, axis=1) for x in X]) # bsize, a*r, b*r # tf 0.12 - X = tf.concat([tf.squeeze(x, axis=1) for x in X], 2) - return tf.reshape(X, (bsize, a*r, b*r, 1)) - def _PS(X, r, n_out_channel): if n_out_channel > 1: assert int(X.get_shape()[-1]) == (r ** 2) * n_out_channel, _err_log - Xc = tf.split(X, n_out_channel, 3) - X = tf.concat([_phase_shift(x, r) for x in Xc], 3) - elif n_out_channel == 1: - assert int(X.get_shape()[-1]) == (r ** 2), _err_log - X = _phase_shift(X, r) + X=tf.transpose(X,[0,2,1,3]) + X=tf.depth_to_space(X,r) + X=tf.transpose(X,[0,2,1,3]) else: print(_err_log) return X @@ -2301,7 +2752,49 @@ def _PS(X, r, n_out_channel): return net_new +def SubpixelConv1d(net, scale=2, act=tf.identity, name='subpixel_conv1d'): + """One-dimensional subpixel upsampling layer. + Calls a tensorflow function that directly implements this functionality. + We assume input has dim (batch, width, r) + + Parameters + ------------ + net : TensorLayer layer. + scale : int, upscaling ratio, a wrong setting will lead to Dimension size error. + act : activation function. + name : string. + An optional name to attach to this layer. + + Examples + ---------- + >>> t_signal = tf.placeholder('float32', [10, 100, 4], name='x') + >>> n = InputLayer(t_signal, name='in') + >>> n = SubpixelConv1d(n, scale=2, name='s') + >>> print(n.outputs.shape) + ... (10, 200, 2) + + References + ----------- + - `Audio Super Resolution Implementation `_. + """ + def _PS(I, r): + X = tf.transpose(I, [2,1,0]) # (r, w, b) + X = tf.batch_to_space_nd(X, [r], [[0,0]]) # (1, r*w, b) + X = tf.transpose(X, [2,1,0]) + return X + + print(" [TL] SubpixelConv1d %s: scale: %d act: %s" % (name, scale, act.__name__)) + inputs = net.outputs + net_new = Layer(inputs, name=name) + with tf.name_scope(name): + net_new.outputs = act(_PS(inputs, r=scale)) + + net_new.all_layers = list(net.all_layers) + net_new.all_params = list(net.all_params) + net_new.all_drop = dict(net.all_drop) + net_new.all_layers.extend( [net_new.outputs] ) + return net_new ## Spatial Transformer Nets def transformer(U, theta, out_size, name='SpatialTransformer2dAffine', **kwargs): @@ -2521,10 +3014,10 @@ def __init__( # 2.1 W n_in = int(self.theta_layer.outputs.get_shape()[-1]) shape = (n_in, 6) - W = tf.get_variable(name='W', initializer=tf.zeros(shape)) + W = tf.get_variable(name='W', initializer=tf.zeros(shape), dtype=D_TYPE) # 2.2 b identity = tf.constant(np.array([[1., 0, 0], [0, 1., 0]]).astype('float32').flatten()) - b = tf.get_variable(name='b', initializer=identity) + b = tf.get_variable(name='b', initializer=identity, dtype=D_TYPE) # 2.3 transformation matrix self.theta = tf.nn.tanh(tf.matmul(self.theta_layer.outputs, W) + b) ## 3. 
Spatial Transformer Sampling @@ -2620,6 +3113,7 @@ class BatchNormLayer(Layer): The initializer for initializing beta gamma_init : gamma initializer The initializer for initializing gamma + dtype : tf.float32 (default) or tf.float16 name : a string or None An optional name to attach to this layer. @@ -2637,6 +3131,7 @@ def __init__( is_train = False, beta_init = tf.zeros_initializer, gamma_init = tf.random_normal_initializer(mean=1.0, stddev=0.002), # tf.ones_initializer, + # dtype = tf.float32, name ='batchnorm_layer', ): Layer.__init__(self, name=name) @@ -2657,10 +3152,13 @@ def __init__( beta_init = beta_init() beta = tf.get_variable('beta', shape=params_shape, initializer=beta_init, + dtype=D_TYPE, trainable=is_train)#, restore=restore) gamma = tf.get_variable('gamma', shape=params_shape, - initializer=gamma_init, trainable=is_train, + initializer=gamma_init, + dtype=D_TYPE, + trainable=is_train, )#restore=restore) ## 2. @@ -2671,10 +3169,12 @@ def __init__( moving_mean = tf.get_variable('moving_mean', params_shape, initializer=moving_mean_init, - trainable=False,)# restore=restore) + dtype=D_TYPE, + trainable=False)# restore=restore) moving_variance = tf.get_variable('moving_variance', params_shape, initializer=tf.constant_initializer(1.), + dtype=D_TYPE, trainable=False,)# restore=restore) ## 3. @@ -3431,29 +3931,130 @@ def mean_var_with_update(): # self.all_layers.extend( [self.outputs] ) # self.all_params.extend( [beta, gamma] ) -## Pooling layer -class PoolLayer(Layer): - """ - The :class:`PoolLayer` class is a Pooling layer, you can choose - ``tf.nn.max_pool`` and ``tf.nn.avg_pool`` for 2D or - ``tf.nn.max_pool3d`` and ``tf.nn.avg_pool3d`` for 3D. +class InstanceNormLayer(Layer): + """The :class:`InstanceNormLayer` class is a for instance normalization. Parameters - ---------- + ----------- layer : a :class:`Layer` instance The `Layer` class feeding into this layer. - ksize : a list of ints that has length >= 4. - The size of the window for each dimension of the input tensor. - strides : a list of ints that has length >= 4. - The stride of the sliding window for each dimension of the input tensor. - padding : a string from: "SAME", "VALID". - The type of padding algorithm to use. - pool : a pooling function - - see `TensorFlow pooling APIs `_ - - class ``tf.nn.max_pool`` - - class ``tf.nn.avg_pool`` - - class ``tf.nn.max_pool3d`` - - class ``tf.nn.avg_pool3d`` + act : activation function. + epsilon : float + A small float number. + scale_init : beta initializer + The initializer for initializing beta + offset_init : gamma initializer + The initializer for initializing gamma + name : a string or None + An optional name to attach to this layer. 
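Before the class body below, here is a minimal sketch of the normalization InstanceNormLayer computes, assuming TF 1.x and an NHWC input (shapes illustrative): moments are taken per sample over the spatial axes only.

# Hedged sketch of instance normalization: one (mean, var) per sample and channel.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 32, 32, 64])
mean, var = tf.nn.moments(x, [1, 2], keep_dims=True)  # reduce over H and W only
x_hat = (x - mean) / tf.sqrt(var + 1e-5)              # zero mean, unit variance per instance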
+ """ + def __init__( + self, + layer = None, + act = tf.identity, + epsilon = 1e-5, + scale_init = tf.truncated_normal_initializer(mean=1.0, stddev=0.02), + offset_init = tf.constant_initializer(0.0), + name ='instan_norm', + ): + Layer.__init__(self, name=name) + self.inputs = layer.outputs + print(" [TL] InstanceNormLayer %s: epsilon:%f act:%s" % + (self.name, epsilon, act.__name__)) + + with tf.variable_scope(name) as vs: + mean, var = tf.nn.moments(self.inputs, [1, 2], keep_dims=True) + scale = tf.get_variable('scale',[self.inputs.get_shape()[-1]], + initializer=tf.truncated_normal_initializer(mean=1.0, stddev=0.02), dtype=D_TYPE) + offset = tf.get_variable('offset',[self.inputs.get_shape()[-1]], + initializer=tf.constant_initializer(0.0), dtype=D_TYPE) + self.outputs = scale * tf.div(self.inputs-mean, tf.sqrt(var+epsilon)) + offset + self.outputs = act(self.outputs) + variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name) + + self.all_layers = list(layer.all_layers) + self.all_params = list(layer.all_params) + self.all_drop = dict(layer.all_drop) + self.all_layers.extend( [self.outputs] ) + self.all_params.extend( variables ) + +class LayerNormLayer(Layer): + """ + The :class:`LayerNormLayer` class is for layer normalization, see `tf.contrib.layers.layer_norm `_. + + Parameters + ---------- + layer : a :class:`Layer` instance + The `Layer` class feeding into this layer. + act : activation function + The function that is applied to the layer activations. + others : see `tf.contrib.layers.layer_norm `_ + """ + def __init__(self, + layer=None, + center=True, + scale=True, + act=tf.identity, + reuse=None, + variables_collections=None, + outputs_collections=None, + trainable=True, + begin_norm_axis=1, + begin_params_axis=-1, + name='layernorm' + ): + + if tf.__version__ < "1.3": + raise Exception("Please use TF 1.3+") + + Layer.__init__(self, name=name) + self.inputs = layer.outputs + print(" [TL] LayerNormLayer %s: act:%s" % + (self.name, act.__name__)) + with tf.variable_scope(name) as vs: + self.outputs = tf.contrib.layers.layer_norm(self.inputs, + center=center, + scale=scale, + activation_fn=act, + reuse=reuse, + variables_collections=variables_collections, + outputs_collections=outputs_collections, + trainable=trainable, + begin_norm_axis=begin_norm_axis, + begin_params_axis=begin_params_axis, + scope='var', + ) + variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name) + + self.all_layers = list(layer.all_layers) + self.all_params = list(layer.all_params) + self.all_drop = dict(layer.all_drop) + self.all_layers.extend( [self.outputs] ) + self.all_params.extend( variables ) + +## Pooling layer +class PoolLayer(Layer): + """ + The :class:`PoolLayer` class is a Pooling layer, you can choose + ``tf.nn.max_pool`` and ``tf.nn.avg_pool`` for 2D or + ``tf.nn.max_pool3d`` and ``tf.nn.avg_pool3d`` for 3D. + + Parameters + ---------- + layer : a :class:`Layer` instance + The `Layer` class feeding into this layer. + ksize : a list of ints that has length >= 4. + The size of the window for each dimension of the input tensor. + strides : a list of ints that has length >= 4. + The stride of the sliding window for each dimension of the input tensor. + padding : a string from: "SAME", "VALID". + The type of padding algorithm to use. 
+ pool : a pooling function + - see `TensorFlow pooling APIs `_ + - class ``tf.nn.max_pool`` + - class ``tf.nn.avg_pool`` + - class ``tf.nn.max_pool3d`` + - class ``tf.nn.avg_pool3d`` name : a string or None An optional name to attach to this layer. @@ -3508,7 +4109,7 @@ def __init__( assert paddings is not None, "paddings should be a Tensor of type int32. see https://www.tensorflow.org/api_docs/python/tf/pad" self.inputs = layer.outputs print(" [TL] PadLayer %s: paddings:%s mode:%s" % - (self.name, list(paddings.get_shape()), mode)) + (self.name, list(paddings), mode)) self.outputs = tf.pad(self.inputs, paddings=paddings, mode=mode, name=name) @@ -3517,6 +4118,49 @@ def __init__( self.all_drop = dict(layer.all_drop) self.all_layers.extend( [self.outputs] ) +## Object Detection +class ROIPoolingLayer(Layer): + """ + The :class:`ROIPoolingLayer` class is Region of interest pooling layer. + + Parameters + ----------- + layer : a :class:`Layer` instance + The `Layer` class feeding into this layer, the feature maps on which to perform the pooling operation + rois : list of regions of interest in the format (feature map index, upper left, bottom right) + pool_width : int, size of the pooling sections. + pool_width : int, size of the pooling sections. + + Notes + ----------- + - This implementation is from `Deepsense-AI `_ . + - Please install it by the instruction `HERE `_. + """ + def __init__( + self, + #inputs = None, + layer = None, + rois = None, + pool_height = 2, + pool_width = 2, + name = 'roipooling_layer', + ): + Layer.__init__(self, name=name) + self.inputs = layer.outputs + print (" [TL] ROIPoolingLayer %s: (%d, %d)" % (self.name, pool_height, pool_width)) + try: + from tensorlayer.third_party.roi_pooling.roi_pooling.roi_pooling_ops import roi_pooling + except Exception as e: + print(e) + print("\nHINT: \n1. https://github.com/deepsense-ai/roi-pooling \n2. tensorlayer/third_party/roi_pooling\n") + self.outputs = roi_pooling(self.inputs, rois, pool_height, pool_width) + + self.all_layers = list(layer.all_layers) + self.all_params = list(layer.all_params) + self.all_drop = dict(layer.all_drop) + self.all_layers.extend( [self.outputs] ) + + ## TimeDistributedLayer class TimeDistributedLayer(Layer): """ @@ -3576,8 +4220,8 @@ def __init__( with ops.suppress_stdout(): for i in range(0, timestep): - with tf.variable_scope(name, reuse=(False if i==0 else True)) as vs: - set_name_reuse((False if i==0 else True)) + with tf.variable_scope(name, reuse=(set_keep['name_reuse'] if i==0 else True)) as vs: + set_name_reuse((set_keep['name_reuse'] if i==0 else True)) net = layer_class(InputLayer(x[i], name=args['name']+str(i)), **args) # net = layer_class(InputLayer(x[i], name="input_"+args['name']), **args) x[i] = net.outputs @@ -3607,11 +4251,11 @@ class RNNLayer(Layer): - see `RNN Cells in TensorFlow `_ cell_init_args : a dictionary The arguments for the cell initializer. - n_hidden : a int + n_hidden : an int The number of hidden units in the layer. initializer : initializer The initializer for initializing the parameters. - n_steps : a int + n_steps : an int The sequence length. initial_state : None or RNN State If None, initial_state is zero_state. @@ -3626,7 +4270,7 @@ class RNNLayer(Layer): name : a string or None An optional name to attach to this layer. - Variables + Attributes -------------- outputs : a tensor The output of this RNN. 
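The PadLayer fix above now prints the paddings list directly. A hedged tf.pad check mirroring that layer (TF 1.x, values illustrative):

# Hedged sketch: padding height and width by 2 on each side, as PadLayer would.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 28, 28, 3])
paddings = [[0, 0], [2, 2], [2, 2], [0, 0]]  # [batch, height, width, channel] pads
y = tf.pad(x, paddings=paddings, mode='REFLECT')
print(y.get_shape())  # (?, 32, 32, 3)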
@@ -3652,27 +4296,25 @@ class RNNLayer(Layer): -------- - For words >>> input_data = tf.placeholder(tf.int32, [batch_size, num_steps]) - >>> network = tl.layers.EmbeddingInputlayer( + >>> net = tl.layers.EmbeddingInputlayer( ... inputs = input_data, ... vocabulary_size = vocab_size, ... embedding_size = hidden_size, ... E_init = tf.random_uniform_initializer(-init_scale, init_scale), ... name ='embedding_layer') - >>> if is_training: - >>> network = tl.layers.DropoutLayer(network, keep=keep_prob, name='drop1') - >>> network = tl.layers.RNNLayer(network, - ... cell_fn=tf.nn.rnn_cell.BasicLSTMCell, + >>> net = tl.layers.DropoutLayer(net, keep=keep_prob, is_fix=True, is_train=is_train, name='drop1') + >>> net = tl.layers.RNNLayer(net, + ... cell_fn=tf.contrib.rnn.BasicLSTMCell, ... cell_init_args={'forget_bias': 0.0},# 'state_is_tuple': True}, ... n_hidden=hidden_size, ... initializer=tf.random_uniform_initializer(-init_scale, init_scale), ... n_steps=num_steps, ... return_last=False, ... name='basic_lstm_layer1') - >>> lstm1 = network - >>> if is_training: - >>> network = tl.layers.DropoutLayer(network, keep=keep_prob, name='drop2') - >>> network = tl.layers.RNNLayer(network, - ... cell_fn=tf.nn.rnn_cell.BasicLSTMCell, + >>> lstm1 = net + >>> net = tl.layers.DropoutLayer(net, keep=keep_prob, is_fix=True, is_train=is_train, name='drop2') + >>> net = tl.layers.RNNLayer(net, + ... cell_fn=tf.contrib.rnn.BasicLSTMCell, ... cell_init_args={'forget_bias': 0.0}, # 'state_is_tuple': True}, ... n_hidden=hidden_size, ... initializer=tf.random_uniform_initializer(-init_scale, init_scale), @@ -3680,10 +4322,9 @@ class RNNLayer(Layer): ... return_last=False, ... return_seq_2d=True, ... name='basic_lstm_layer2') - >>> lstm2 = network - >>> if is_training: - >>> network = tl.layers.DropoutLayer(network, keep=keep_prob, name='drop3') - >>> network = tl.layers.DenseLayer(network, + >>> lstm2 = net + >>> net = tl.layers.DropoutLayer(net, keep=keep_prob, is_fix=True, is_train=is_train, name='drop3') + >>> net = tl.layers.DenseLayer(net, ... n_units=vocab_size, ... W_init=tf.random_uniform_initializer(-init_scale, init_scale), ... b_init=tf.random_uniform_initializer(-init_scale, init_scale), @@ -3691,34 +4332,34 @@ class RNNLayer(Layer): - For CNN+LSTM >>> x = tf.placeholder(tf.float32, shape=[batch_size, image_size, image_size, 1]) - >>> network = tl.layers.InputLayer(x, name='input_layer') - >>> network = tl.layers.Conv2dLayer(network, + >>> net = tl.layers.InputLayer(x, name='input_layer') + >>> net = tl.layers.Conv2dLayer(net, ... act = tf.nn.relu, ... shape = [5, 5, 1, 32], # 32 features for each 5x5 patch ... strides=[1, 2, 2, 1], ... padding='SAME', ... name ='cnn_layer1') - >>> network = tl.layers.PoolLayer(network, + >>> net = tl.layers.PoolLayer(net, ... ksize=[1, 2, 2, 1], ... strides=[1, 2, 2, 1], ... padding='SAME', ... pool = tf.nn.max_pool, ... name ='pool_layer1') - >>> network = tl.layers.Conv2dLayer(network, + >>> net = tl.layers.Conv2dLayer(net, ... act = tf.nn.relu, ... shape = [5, 5, 32, 10], # 10 features for each 5x5 patch ... strides=[1, 2, 2, 1], ... padding='SAME', ... name ='cnn_layer2') - >>> network = tl.layers.PoolLayer(network, + >>> net = tl.layers.PoolLayer(net, ... ksize=[1, 2, 2, 1], ... strides=[1, 2, 2, 1], ... padding='SAME', ... pool = tf.nn.max_pool, ... 
name ='pool_layer2') - >>> network = tl.layers.FlattenLayer(network, name='flatten_layer') - >>> network = tl.layers.ReshapeLayer(network, shape=[-1, num_steps, int(network.outputs._shape[-1])]) - >>> rnn1 = tl.layers.RNNLayer(network, + >>> net = tl.layers.FlattenLayer(net, name='flatten_layer') + >>> net = tl.layers.ReshapeLayer(net, shape=[-1, num_steps, int(net.outputs._shape[-1])]) + >>> rnn1 = tl.layers.RNNLayer(net, ... cell_fn=tf.nn.rnn_cell.LSTMCell, ... cell_init_args={}, ... n_hidden=200, @@ -3727,7 +4368,7 @@ class RNNLayer(Layer): ... return_last=False, ... return_seq_2d=True, ... name='rnn_layer') - >>> network = tl.layers.DenseLayer(rnn1, n_units=3, + >>> net = tl.layers.DenseLayer(rnn1, n_units=3, ... act = tl.activation.identity, name='output_layer') Notes @@ -3814,7 +4455,7 @@ def __init__( else: self.cell = cell = cell_fn(num_units=n_hidden, **cell_init_args) if initial_state is None: - self.initial_state = cell.zero_state(batch_size, dtype=tf.float32) # 1.2.3 + self.initial_state = cell.zero_state(batch_size, dtype=D_TYPE) #dtype=tf.float32) # 1.2.3 state = self.initial_state # with tf.variable_scope("model", reuse=None, initializer=initializer): with tf.variable_scope(name, initializer=initializer) as vs: @@ -3871,11 +4512,11 @@ class BiRNNLayer(Layer): - see `RNN Cells in TensorFlow `_ cell_init_args : a dictionary The arguments for the cell initializer. - n_hidden : a int + n_hidden : an int The number of hidden units in the layer. initializer : initializer The initializer for initializing the parameters. - n_steps : a int + n_steps : an int The sequence length. fw_initial_state : None or forward RNN State If None, initial_state is zero_state. @@ -3883,7 +4524,7 @@ class BiRNNLayer(Layer): If None, initial_state is zero_state. dropout : `tuple` of `float`: (input_keep_prob, output_keep_prob). The input and output keep probability. - n_layer : a int, default is 1. + n_layer : an int, default is 1. The number of RNN layers. return_last : boolean - If True, return the last output, "Sequence input and single output" @@ -3896,7 +4537,7 @@ class BiRNNLayer(Layer): name : a string or None An optional name to attach to this layer. - Variables + Attributes -------------- outputs : a tensor The output of this RNN. @@ -4014,11 +4655,11 @@ def __init__( # Initial state of RNN if fw_initial_state is None: - self.fw_initial_state = self.fw_cell.zero_state(self.batch_size, dtype=tf.float32) + self.fw_initial_state = self.fw_cell.zero_state(self.batch_size, dtype=D_TYPE) # dtype=tf.float32) else: self.fw_initial_state = fw_initial_state if bw_initial_state is None: - self.bw_initial_state = self.bw_cell.zero_state(self.batch_size, dtype=tf.float32) + self.bw_initial_state = self.bw_cell.zero_state(self.batch_size, dtype=D_TYPE) # dtype=tf.float32) else: self.bw_initial_state = bw_initial_state # exit() @@ -4041,6 +4682,7 @@ def __init__( ) if return_last: + raise Exception("Do not support return_last at the moment.") self.outputs = outputs[-1] else: self.outputs = outputs @@ -4072,6 +4714,300 @@ def __init__( self.all_layers.extend( [self.outputs] ) self.all_params.extend( rnn_variables ) + +# ConvLSTM layer +class ConvRNNCell(object): + """Abstract object representing an Convolutional RNN Cell. + """ + + def __call__(self, inputs, state, scope=None): + """Run this RNN cell on inputs, starting from the given state. + """ + raise NotImplementedError("Abstract method") + + @property + def state_size(self): + """size(s) of state(s) used by this cell. 
+ """ + raise NotImplementedError("Abstract method") + + @property + def output_size(self): + """Integer or TensorShape: size of outputs produced by this cell.""" + raise NotImplementedError("Abstract method") + + def zero_state(self, batch_size, dtype): + """Return zero-filled state tensor(s). + Args: + batch_size: int, float, or unit Tensor representing the batch size. + dtype: the data type to use for the state. + Returns: + tensor of shape '[batch_size x shape[0] x shape[1] x num_features] + filled with zeros + """ + + shape = self.shape + num_features = self.num_features + zeros = tf.zeros([batch_size, shape[0], shape[1], num_features * 2]) + return zeros + +class BasicConvLSTMCell(ConvRNNCell): + """Basic Conv LSTM recurrent network cell. + + Parameters + ----------- + shape : int tuple thats the height and width of the cell + filter_size : int tuple thats the height and width of the filter + num_features : int thats the depth of the cell + forget_bias : float, The bias added to forget gates (see above). + input_size : Deprecated and unused. + state_is_tuple : If True, accepted and returned states are 2-tuples of + the `c_state` and `m_state`. If False, they are concatenated + along the column axis. The latter behavior will soon be deprecated. + activation : Activation function of the inner states. + """ + def __init__(self, shape, filter_size, num_features, forget_bias=1.0, input_size=None, + state_is_tuple=False, activation=tf.nn.tanh): + """Initialize the basic Conv LSTM cell. + """ + # if not state_is_tuple: + # logging.warn("%s: Using a concatenated state is slower and will soon be " + # "deprecated. Use state_is_tuple=True.", self) + if input_size is not None: + logging.warn("%s: The input_size parameter is deprecated.", self) + self.shape = shape + self.filter_size = filter_size + self.num_features = num_features + self._forget_bias = forget_bias + self._state_is_tuple = state_is_tuple + self._activation = activation + + @property + def state_size(self): + """ State size of the LSTMStateTuple. """ + return (LSTMStateTuple(self._num_units, self._num_units) + if self._state_is_tuple else 2 * self._num_units) + + @property + def output_size(self): + """ Number of units in outputs. """ + return self._num_units + + def __call__(self, inputs, state, scope=None): + """Long short-term memory cell (LSTM).""" + with tf.variable_scope(scope or type(self).__name__): # "BasicLSTMCell" + # Parameters of gates are concatenated into one multiply for efficiency. + if self._state_is_tuple: + c, h = state + else: + # print state + # c, h = tf.split(3, 2, state) + c, h = tf.split(state, 2, 3) + concat = _conv_linear([inputs, h], self.filter_size, self.num_features * 4, True) + + # i = input_gate, j = new_input, f = forget_gate, o = output_gate + # i, j, f, o = tf.split(3, 4, concat) + i, j, f, o = tf.split(concat, 4, 3) + + new_c = (c * tf.nn.sigmoid(f + self._forget_bias) + tf.nn.sigmoid(i) * + self._activation(j)) + new_h = self._activation(new_c) * tf.nn.sigmoid(o) + + if self._state_is_tuple: + new_state = LSTMStateTuple(new_c, new_h) + else: + new_state = tf.concat([new_c, new_h], 3) + return new_h, new_state + +def _conv_linear(args, filter_size, num_features, bias, bias_start=0.0, scope=None): + """convolution: + + Parameters + ---------- + args: a 4D Tensor or a list of 4D, batch x n, Tensors. + filter_size: int tuple of filter height and width. + num_features: int, number of features. + bias_start: starting value to initialize the bias; 0 by default. 
+ scope: VariableScope for the created subgraph; defaults to "Linear". + + Returns + -------- + - A 4D Tensor with shape [batch h w num_features] + + Raises + ------- + - ValueError : if some of the arguments has unspecified or wrong shape. + """ + + # Calculate the total size of arguments on dimension 1. + total_arg_size_depth = 0 + shapes = [a.get_shape().as_list() for a in args] + for shape in shapes: + if len(shape) != 4: + raise ValueError("Linear is expecting 4D arguments: %s" % str(shapes)) + if not shape[3]: + raise ValueError("Linear expects shape[4] of arguments: %s" % str(shapes)) + else: + total_arg_size_depth += shape[3] + + dtype = [a.dtype for a in args][0] + + # Now the computation. + with tf.variable_scope(scope or "Conv"): + matrix = tf.get_variable( + "Matrix", [filter_size[0], filter_size[1], total_arg_size_depth, num_features], dtype=dtype) + if len(args) == 1: + res = tf.nn.conv2d(args[0], matrix, strides=[1, 1, 1, 1], padding='SAME') + else: + res = tf.nn.conv2d(tf.concat(args, 3), matrix, strides=[1, 1, 1, 1], padding='SAME') + if not bias: + return res + bias_term = tf.get_variable( + "Bias", [num_features], + dtype=dtype, + initializer=tf.constant_initializer( + bias_start, dtype=dtype)) + return res + bias_term + +class ConvLSTMLayer(Layer): + """ + The :class:`ConvLSTMLayer` class is a Convolutional LSTM layer, + see `Convolutional LSTM Layer `_ . + + Parameters + ---------- + layer : a :class:`Layer` instance + The `Layer` class feeding into this layer. + cell_shape : tuple, the shape of each cell width*height + filter_size : tuple, the size of filter width*height + cell_fn : a Convolutional RNN cell as follow. + feature_map : a int + The number of feature map in the layer. + initializer : initializer + The initializer for initializing the parameters. + n_steps : a int + The sequence length. + initial_state : None or ConvLSTM State + If None, initial_state is zero_state. + return_last : boolen + - If True, return the last output, "Sequence input and single output" + - If False, return all outputs, "Synced sequence input and output" + - In other word, if you want to apply one or more ConvLSTM(s) on this layer, set to False. + return_seq_2d : boolen + - When return_last = False + - If True, return 4D Tensor [n_example, h, w, c], for stacking DenseLayer after it. + - If False, return 5D Tensor [n_example/n_steps, h, w, c], for stacking multiple ConvLSTM after it. + name : a string or None + An optional name to attach to this layer. + + Variables + -------------- + outputs : a tensor + The output of this RNN. + return_last = False, outputs = all cell_output, which is the hidden state. + cell_output.get_shape() = (?, h, w, c]) + + final_state : a tensor or StateTuple + When state_is_tuple = False, + it is the final hidden and cell states, + When state_is_tuple = True, + You can get the final state after each iteration during training, then + feed it to the initial state of next iteration. + + initial_state : a tensor or StateTuple + It is the initial state of this ConvLSTM layer, you can use it to initialize + your state at the begining of each epoch or iteration according to your + training procedure. + + batch_size : int or tensor + Is int, if able to compute the batch_size, otherwise, tensor for ``?``. 
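A hedged check (illustrative sizes) of the concatenated (c, h) state layout that BasicConvLSTMCell uses when state_is_tuple=False: zero_state packs both tensors on the channel axis, and __call__ splits them back with tf.split(state, 2, 3).

# Hedged sketch of the packed ConvLSTM state, matching zero_state above.
import tensorflow as tf

batch_size, h, w, num_features = 4, 8, 8, 16
state = tf.zeros([batch_size, h, w, num_features * 2])  # c and h stacked on channels
c, h_state = tf.split(state, 2, 3)                      # two (4, 8, 8, 16) tensors
print(c.get_shape(), h_state.get_shape())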
+ """ + def __init__( + self, + layer=None, + cell_shape=None, + feature_map=1, + filter_size=(3, 3), + cell_fn=BasicConvLSTMCell, + initializer=tf.random_uniform_initializer(-0.1, 0.1), + n_steps=5, + initial_state=None, + return_last=False, + return_seq_2d=False, + name='convlstm_layer', + ): + Layer.__init__(self, name=name) + self.inputs = layer.outputs + print(" [TL] ConvLSTMLayer %s: feature_map:%d, n_steps:%d, " + "in_dim:%d %s, cell_fn:%s " % (self.name, feature_map, + n_steps, self.inputs.get_shape().ndims, self.inputs.get_shape(), + cell_fn.__name__)) + # You can get the dimension by .get_shape() or ._shape, and check the + # dimension by .with_rank() as follow. + # self.inputs.get_shape().with_rank(2) + # self.inputs.get_shape().with_rank(3) + + # Input dimension should be rank 5 [batch_size, n_steps(max), h, w, c] + try: + self.inputs.get_shape().with_rank(5) + except: + raise Exception("RNN : Input dimension should be rank 5 : [batch_size, n_steps, input_x, " + "input_y, feature_map]") + + fixed_batch_size = self.inputs.get_shape().with_rank_at_least(1)[0] + + if fixed_batch_size.value: + batch_size = fixed_batch_size.value + print(" RNN batch_size (concurrent processes): %d" % batch_size) + else: + from tensorflow.python.ops import array_ops + batch_size = array_ops.shape(self.inputs)[0] + print(" non specified batch_size, uses a tensor instead.") + self.batch_size = batch_size + + + outputs = [] + self.cell = cell = cell_fn(shape=cell_shape, filter_size=filter_size, num_features=feature_map) + if initial_state is None: + self.initial_state = cell.zero_state(batch_size, dtype=D_TYPE) # dtype=tf.float32) # 1.2.3 + state = self.initial_state + # with tf.variable_scope("model", reuse=None, initializer=initializer): + with tf.variable_scope(name, initializer=initializer) as vs: + for time_step in range(n_steps): + if time_step > 0: tf.get_variable_scope().reuse_variables() + (cell_output, state) = cell(self.inputs[:, time_step, :, :, :], state) + outputs.append(cell_output) + + # Retrieve just the RNN variables. + # rnn_variables = [v for v in tf.all_variables() if v.name.startswith(vs.name)] + rnn_variables = tf.get_collection(tf.GraphKeys.VARIABLES, scope=vs.name) + + print(" n_params : %d" % (len(rnn_variables))) + + if return_last: + # 2D Tensor [batch_size, n_hidden] + self.outputs = outputs[-1] + else: + if return_seq_2d: + # PTB tutorial: stack dense layer after that, or compute the cost from the output + # 4D Tensor [n_example, h, w, c] + self.outputs = tf.reshape(tf.concat(outputs, 1), [-1, cell_shape[0] * cell_shape[1] * feature_map]) + else: + # : stack more RNN layer after that + # 5D Tensor [n_example/n_steps, n_steps, h, w, c] + self.outputs = tf.reshape(tf.concat(outputs, 1), [-1, n_steps, cell_shape[0], + cell_shape[1], feature_map]) + + self.final_state = state + + self.all_layers = list(layer.all_layers) + self.all_params = list(layer.all_params) + self.all_drop = dict(layer.all_drop) + self.all_layers.extend([self.outputs]) + self.all_params.extend(rnn_variables) + + + # Advanced Ops for Dynamic RNN def advanced_indexing_op(input, index): """Advanced Indexing for Sequences, returns the outputs by given sequence lengths. @@ -4147,7 +5083,7 @@ def retrieve_seq_length_op(data): >>> data = [[[1,2],[2,2],[1,2],[1,2],[0,0]], ... [[2,3],[2,4],[3,2],[0,0],[0,0]], ... [[3,3],[2,2],[5,3],[1,2],[0,0]]] - >>> sl + >>> print(sl) ... 
[4 3 4] References @@ -4186,7 +5122,6 @@ def retrieve_seq_length_op2(data): """ return tf.reduce_sum(tf.cast(tf.greater(data, tf.zeros_like(data)), tf.int32), 1) - def retrieve_seq_length_op3(data, pad_val=0): # HangSheng: return tensor for sequence length, if input is tf.string data_shape_size = data.get_shape().ndims if data_shape_size == 3: @@ -4198,7 +5133,6 @@ def retrieve_seq_length_op3(data, pad_val=0): # HangSheng: return tensor for seq else: raise ValueError("retrieve_seq_length_op3: handling data_shape_size %s hasn't been implemented!" % (data_shape_size)) - def target_mask_op(data, pad_val=0): # HangSheng: return tensor for mask,if input is tf.string data_shape_size = data.get_shape().ndims if data_shape_size == 3: @@ -4224,7 +5158,7 @@ class DynamicRNNLayer(Layer): - see `RNN Cells in TensorFlow `_ cell_init_args : a dictionary The arguments for the cell initializer. - n_hidden : a int + n_hidden : an int The number of hidden units in the layer. initializer : initializer The initializer for initializing the parameters. @@ -4237,7 +5171,7 @@ class DynamicRNNLayer(Layer): If None, initial_state is zero_state. dropout : `tuple` of `float`: (input_keep_prob, output_keep_prob). The input and output keep probability. - n_layer : a int, default is 1. + n_layer : an int, default is 1. The number of RNN layers. return_last : boolean - If True, return the last output, "Sequence input and single output" @@ -4279,19 +5213,19 @@ class DynamicRNNLayer(Layer): Examples -------- >>> input_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="input_seqs") - >>> network = tl.layers.EmbeddingInputlayer( + >>> net = tl.layers.EmbeddingInputlayer( ... inputs = input_seqs, ... vocabulary_size = vocab_size, ... embedding_size = embedding_size, ... name = 'seq_embedding') - >>> network = tl.layers.DynamicRNNLayer(network, + >>> net = tl.layers.DynamicRNNLayer(net, ... cell_fn = tf.contrib.rnn.BasicLSTMCell, # for TF0.2 tf.nn.rnn_cell.BasicLSTMCell, ... n_hidden = embedding_size, ... dropout = 0.7, ... sequence_length = tl.layers.retrieve_seq_length_op2(input_seqs), ... return_seq_2d = True, # stack denselayer or compute cost after it ... name = 'dynamic_rnn') - ... network = tl.layers.DenseLayer(network, n_units=vocab_size, + ... net = tl.layers.DenseLayer(net, n_units=vocab_size, ... act=tf.identity, name="output") References @@ -4400,7 +5334,7 @@ def __init__( # Initialize initial_state if initial_state is None: - self.initial_state = self.cell.zero_state(batch_size, dtype=tf.float32) + self.initial_state = self.cell.zero_state(batch_size, dtype=D_TYPE) # dtype=tf.float32) else: self.initial_state = initial_state @@ -4482,11 +5416,11 @@ class BiDynamicRNNLayer(Layer): - see `RNN Cells in TensorFlow `_ cell_init_args : a dictionary The arguments for the cell initializer. - n_hidden : a int + n_hidden : an int The number of hidden units in the layer. initializer : initializer The initializer for initializing the parameters. - sequence_length : a tensor, array or None + sequence_length : a tensor, array or None. The sequence length of each row of input data, see ``Advanced Ops for Dynamic RNN``. - If None, it uses ``retrieve_seq_length_op`` to compute the sequence_length, i.e. when the features of padding (on right hand side) are all zeros. - If using word embedding, you may need to compute the sequence_length from the ID array (the integer features before word embedding) by using ``retrieve_seq_length_op2`` or ``retrieve_seq_length_op``. 
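Since retrieve_seq_length_op2 is a one-liner, a small numeric check makes its behavior concrete: it counts the nonzero positions per row, so zero-padded ID sequences yield their true lengths (TF 1.x, data illustrative).

# Hedged numeric check of retrieve_seq_length_op2 on padded ID sequences.
import tensorflow as tf

data = tf.constant([[4, 3, 5, 3, 0], [2, 3, 0, 0, 0], [1, 1, 1, 1, 1]])
seq_len = tf.reduce_sum(tf.cast(tf.greater(data, tf.zeros_like(data)), tf.int32), 1)
with tf.Session() as sess:
    print(sess.run(seq_len))  # [4 2 5]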
@@ -4498,7 +5432,7 @@ class BiDynamicRNNLayer(Layer): If None, initial_state is zero_state. dropout : `tuple` of `float`: (input_keep_prob, output_keep_prob). The input and output keep probability. - n_layer : a int, default is 1. + n_layer : an int, default is 1. The number of RNN layers. return_last : boolean If True, return the last output, "Sequence input and single output"\n @@ -4511,7 +5445,7 @@ class BiDynamicRNNLayer(Layer): name : a string or None An optional name to attach to this layer. - Variables + Attributes ----------------------- outputs : a tensor The output of this RNN. @@ -4633,15 +5567,22 @@ def __init__( # cell_instance_fn=lambda: MultiRNNCell_fn([cell_instance_fn2() for _ in range(n_layer)]) self.fw_cell = MultiRNNCell_fn([cell_creator() for _ in range(n_layer)]) self.bw_cell = MultiRNNCell_fn([cell_creator() for _ in range(n_layer)]) + + if dropout: + self.fw_cell = DropoutWrapper_fn(self.fw_cell, + input_keep_prob=1.0, output_keep_prob=out_keep_prob) + self.bw_cell = DropoutWrapper_fn(self.bw_cell, + input_keep_prob=1.0, output_keep_prob=out_keep_prob) + # self.fw_cell=cell_instance_fn() # self.bw_cell=cell_instance_fn() # Initial state of RNN if fw_initial_state is None: - self.fw_initial_state = self.fw_cell.zero_state(self.batch_size, dtype=tf.float32) + self.fw_initial_state = self.fw_cell.zero_state(self.batch_size, dtype=D_TYPE) # dtype=tf.float32) else: self.fw_initial_state = fw_initial_state if bw_initial_state is None: - self.bw_initial_state = self.bw_cell.zero_state(self.batch_size, dtype=tf.float32) + self.bw_initial_state = self.bw_cell.zero_state(self.batch_size, dtype=D_TYPE) # dtype=tf.float32) else: self.bw_initial_state = bw_initial_state # Computes sequence_length @@ -4672,6 +5613,7 @@ def __init__( outputs = tf.concat(2, outputs) if return_last: # [batch_size, 2 * n_hidden] + raise Exception("Do not support return_last at the moment") self.outputs = advanced_indexing_op(outputs, sequence_length) else: # [batch_size, n_step(max), 2 * n_hidden] @@ -4709,10 +5651,12 @@ def __init__( # Seq2seq class Seq2Seq(Layer): """ - The :class:`Seq2Seq` class is a simple :class:`DynamicRNNLayer` based Seq2seq layer, - both encoder and decoder are :class:`DynamicRNNLayer`, network details - see `Model `_ - and `Sequence to Sequence Learning with Neural Networks `_ . + The :class:`Seq2Seq` class is a Simple :class:`DynamicRNNLayer` based Seq2seq layer without using `tl.contrib.seq2seq `_. + See `Model `_ + and `Sequence to Sequence Learning with Neural Networks `_. + + - Please check the example `Chatbot in 200 lines of code `_. + - The Author recommends users to read the source code of :class:`DynamicRNNLayer` and :class:`Seq2Seq`. Parameters ---------- @@ -4724,17 +5668,19 @@ class Seq2Seq(Layer): - see `RNN Cells in TensorFlow `_ cell_init_args : a dictionary The arguments for the cell initializer. - n_hidden : a int + n_hidden : an int The number of hidden units in the layer. initializer : initializer The initializer for initializing the parameters. encode_sequence_length : tensor for encoder sequence length, see :class:`DynamicRNNLayer` . decode_sequence_length : tensor for decoder sequence length, see :class:`DynamicRNNLayer` . - initial_state : None or forward RNN State - If None, initial_state is of encoder zero_state. + initial_state_encode : None or RNN state (from placeholder or other RNN). + If None, initial_state_encode is of zero state. + initial_state_decode : None or RNN state (from placeholder or other RNN). 
+ If None, initial_state_decode is of the final state of the RNN encoder. dropout : `tuple` of `float`: (input_keep_prob, output_keep_prob). The input and output keep probability. - n_layer : a int, default is 1. + n_layer : an int, default is 1. The number of RNN layers. return_seq_2d : boolean - When return_last = False @@ -4743,13 +5689,27 @@ class Seq2Seq(Layer): name : a string or None An optional name to attach to this layer. - Variables + Attributes ------------ outputs : a tensor The output of RNN decoder. + initial_state_encode : a tensor or StateTuple + Initial state of RNN encoder. + initial_state_decode : a tensor or StateTuple + Initial state of RNN decoder. + final_state_encode : a tensor or StateTuple + Final state of RNN encoder. + final_state_decode : a tensor or StateTuple + Final state of RNN decoder. - final_state : a tensor or StateTuple - Final state of decoder, see :class:`DynamicRNNLayer` . + Notes + -------- + - How to feed data: `Sequence to Sequence Learning with Neural Networks `_ + - input_seqs : ``['how', 'are', 'you', ']`` + - decode_seqs : ``['', 'I', 'am', 'fine', ']`` + - target_seqs : ``['I', 'am', 'fine', ']`` + - target_mask : ``[1, 1, 1, 1, 0]`` + - related functions : tl.prepro Examples ---------- @@ -4781,7 +5741,7 @@ class Seq2Seq(Layer): ... initializer = tf.random_uniform_initializer(-0.1, 0.1), ... encode_sequence_length = retrieve_seq_length_op2(encode_seqs), ... decode_sequence_length = retrieve_seq_length_op2(decode_seqs), - ... initial_state = None, + ... initial_state_encode = None, ... dropout = None, ... n_layer = 1, ... return_seq_2d = True, @@ -4791,14 +5751,7 @@ class Seq2Seq(Layer): >>> y = tf.nn.softmax(net_out.outputs) >>> net_out.print_params(False) - Notes - -------- - - How to feed data: `Sequence to Sequence Learning with Neural Networks `_ - - input_seqs : ``['how', 'are', 'you', ']`` - - decode_seqs : ``['', 'I', 'am', 'fine', ']`` - - target_seqs : ``['I', 'am', 'fine', ' + """ def __init__( self, @@ -4810,7 +5763,8 @@ def __init__( initializer = tf.random_uniform_initializer(-0.1, 0.1), encode_sequence_length = None, decode_sequence_length = None, - initial_state = None, + initial_state_encode = None, + initial_state_decode = None, dropout = None, n_layer = 1, # return_last = False, @@ -4836,7 +5790,7 @@ def __init__( cell_fn = cell_fn, cell_init_args = cell_init_args, n_hidden = n_hidden, - initial_state = initial_state, + initial_state = initial_state_encode, dropout = dropout, n_layer = n_layer, sequence_length = encode_sequence_length, @@ -4849,7 +5803,7 @@ def __init__( cell_fn = cell_fn, cell_init_args = cell_init_args, n_hidden = n_hidden, - initial_state = network_encode.final_state, + initial_state = (network_encode.final_state if initial_state_decode is None else initial_state_decode), dropout = dropout, n_layer = n_layer, sequence_length = decode_sequence_length, @@ -4858,18 +5812,27 @@ def __init__( name = name+'_decode') self.outputs = network_decode.outputs - rnn_variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name) + # rnn_variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name) + + # Initial state + self.initial_state_encode = network_encode.initial_state + self.initial_state_decode = network_decode.initial_state # Final state - self.final_state = network_decode.final_state + self.final_state_encode = network_encode.final_state + self.final_state_decode = network_decode.final_state # self.sequence_length = sequence_length - self.all_layers = list(network_decode.all_layers) - 
self.all_params = list(network_decode.all_params)
-        self.all_drop = dict(network_decode.all_drop)
+        self.all_layers = list(network_encode.all_layers)
+        self.all_params = list(network_encode.all_params)
+        self.all_drop = dict(network_encode.all_drop)
+
+        self.all_layers.extend(list(network_decode.all_layers))
+        self.all_params.extend(list(network_decode.all_params))
+        self.all_drop.update(dict(network_decode.all_drop))

        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( rnn_variables )
+        # self.all_params.extend( rnn_variables )

        self.all_layers = list_remove_repeat(self.all_layers)
        self.all_params = list_remove_repeat(self.all_params)
@@ -4953,20 +5916,20 @@ class FlattenLayer(Layer):
    Examples
    --------
    >>> x = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
-    >>> network = tl.layers.InputLayer(x, name='input_layer')
-    >>> network = tl.layers.Conv2dLayer(network,
+    >>> net = tl.layers.InputLayer(x, name='input_layer')
+    >>> net = tl.layers.Conv2dLayer(net,
    ...                    act = tf.nn.relu,
    ...                    shape = [5, 5, 32, 64],
    ...                    strides=[1, 1, 1, 1],
    ...                    padding='SAME',
    ...                    name ='cnn_layer')
-    >>> network = tl.layers.Pool2dLayer(network,
+    >>> net = tl.layers.Pool2dLayer(net,
    ...                    ksize=[1, 2, 2, 1],
    ...                    strides=[1, 2, 2, 1],
    ...                    padding='SAME',
    ...                    pool = tf.nn.max_pool,
    ...                    name ='pool_layer',)
-    >>> network = tl.layers.FlattenLayer(network, name='flatten_layer')
+    >>> net = tl.layers.FlattenLayer(net, name='flatten_layer')
    """
    def __init__(
        self,
@@ -5026,6 +5989,39 @@ def __init__(
        self.all_drop = dict(layer.all_drop)
        self.all_layers.extend( [self.outputs] )

+class TransposeLayer(Layer):
+    """
+    The :class:`TransposeLayer` class transposes the dimensions of a tensor, see `tf.transpose() `_ .
+
+    Parameters
+    ----------
+    layer : a :class:`Layer` instance
+        The `Layer` class feeding into this layer.
+    perm: list, a permutation of the dimensions
+        Similar with numpy.transpose.
+    name : a string or None
+        An optional name to attach to this layer.
+    """
+    def __init__(
+        self,
+        layer = None,
+        perm = None,
+        name = 'transpose',
+    ):
+        Layer.__init__(self, name=name)
+        self.inputs = layer.outputs
+        assert perm is not None
+
+        print("  [TL] TransposeLayer  %s: perm:%s" % (self.name, perm))
+        # with tf.variable_scope(name) as vs:
+        self.outputs = tf.transpose(self.inputs, perm=perm, name=name)
+        self.all_layers = list(layer.all_layers)
+        self.all_params = list(layer.all_params)
+        self.all_drop = dict(layer.all_drop)
+        self.all_layers.extend( [self.outputs] )
+        # self.all_params.extend( variables )
+
+## Lambda
class LambdaLayer(Layer):
    """
    The :class:`LambdaLayer` class is a layer which is able to use the provided function.
@@ -5044,9 +6040,9 @@ class LambdaLayer(Layer):
    Examples
    ---------
    >>> x = tf.placeholder(tf.float32, shape=[None, 1], name='x')
-    >>> network = tl.layers.InputLayer(x, name='input_layer')
-    >>> network = LambdaLayer(network, lambda x: 2*x, name='lambda_layer')
-    >>> y = network.outputs
+    >>> net = tl.layers.InputLayer(x, name='input_layer')
+    >>> net = LambdaLayer(net, lambda x: 2*x, name='lambda_layer')
+    >>> y = net.outputs
    >>> sess = tf.InteractiveSession()
    >>> out = sess.run(y, feed_dict={x : [[1],[2]]})
    ... [[2],[4]]
@@ -5076,8 +6072,7 @@ def __init__(
## Merge layer
class ConcatLayer(Layer):
    """
-    The :class:`ConcatLayer` class is layer which concat (merge) two or more
-    :class:`DenseLayer` to a single class:`DenseLayer`.
+    The :class:`ConcatLayer` class is a layer which concatenates (merges) two or more tensors along a given axis.
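A hedged tf.concat check matching ConcatLayer's axis semantics and the docstring example that follows (TF 1.x, sizes illustrative):

# Hedged sketch: concatenating two feature tensors along the feature axis.
import tensorflow as tf

a = tf.placeholder(tf.float32, [None, 800])
b = tf.placeholder(tf.float32, [None, 300])
c = tf.concat([a, b], 1)  # merge along axis 1, the feature axis
print(c.get_shape())      # (?, 1100)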
Parameters ---------- @@ -5089,29 +6084,27 @@ class ConcatLayer(Layer): An optional name to attach to this layer. Examples - -------- + ---------- >>> sess = tf.InteractiveSession() >>> x = tf.placeholder(tf.float32, shape=[None, 784]) >>> inputs = tl.layers.InputLayer(x, name='input_layer') - >>> net1 = tl.layers.DenseLayer(inputs, n_units=800, act = tf.nn.relu, name='relu1_1') - >>> net2 = tl.layers.DenseLayer(inputs, n_units=300, act = tf.nn.relu, name='relu2_1') - >>> network = tl.layers.ConcatLayer(layer = [net1, net2], name ='concat_layer') + >>> net1 = tl.layers.DenseLayer(inputs, 800, act=tf.nn.relu, name='relu1_1') + >>> net2 = tl.layers.DenseLayer(inputs, 300, act=tf.nn.relu, name='relu2_1') + >>> net = tl.layers.ConcatLayer([net1, net2], 1, name ='concat_layer') ... [TL] InputLayer input_layer (?, 784) - ... [TL] DenseLayer relu1_1: 800, - ... [TL] DenseLayer relu2_1: 300, + ... [TL] DenseLayer relu1_1: 800, relu + ... [TL] DenseLayer relu2_1: 300, relu ... [TL] ConcatLayer concat_layer, 1100 - ... >>> tl.layers.initialize_global_variables(sess) - >>> network.print_params() + >>> net.print_params() ... param 0: (784, 800) (mean: 0.000021, median: -0.000020 std: 0.035525) - ... param 1: (800,) (mean: 0.000000, median: 0.000000 std: 0.000000) + ... param 1: (800,) (mean: 0.000000, median: 0.000000 std: 0.000000) ... param 2: (784, 300) (mean: 0.000000, median: -0.000048 std: 0.042947) - ... param 3: (300,) (mean: 0.000000, median: 0.000000 std: 0.000000) + ... param 3: (300,) (mean: 0.000000, median: 0.000000 std: 0.000000) ... num of params: 863500 - >>> network.print_layers() - ... layer 0: Tensor("Relu:0", shape=(?, 800), dtype=float32) + >>> net.print_layers() + ... layer 0: ("Relu:0", shape=(?, 800), dtype=float32) ... layer 1: Tensor("Relu_1:0", shape=(?, 300), dtype=float32) - ... """ def __init__( self, @@ -5127,8 +6120,8 @@ def __init__( self.outputs = tf.concat(self.inputs, concat_dim, name=name) except: # TF0.12 self.outputs = tf.concat(concat_dim, self.inputs, name=name) - self.n_units = int(self.outputs.get_shape()[-1]) - print(" [TL] ConcatLayer %s: %d" % (self.name, self.n_units)) + + print(" [TL] ConcatLayer %s: axis: %d" % (self.name, concat_dim)) self.all_layers = list(layer[0].all_layers) self.all_params = list(layer[0].all_params) @@ -5197,7 +6190,7 @@ def __init__( self.all_params = list_remove_repeat(self.all_params) # self.all_drop = list_remove_repeat(self.all_drop) -# Extend +## Extend class ExpandDimsLayer(Layer): """ The :class:`ExpandDimsLayer` class inserts a dimension of 1 into a tensor's shape, @@ -5265,12 +6258,102 @@ def __init__( self.all_layers.extend( [self.outputs] ) # self.all_params.extend( variables ) +## Stack Unstack +class StackLayer(Layer): + """ + The :class:`StackLayer` class is layer for stacking a list of rank-R tensors into one rank-(R+1) tensor, see `tf.stack() `_. + + Parameters + ---------- + layer : a list of :class:`Layer` instances + The `Layer` class feeding into this layer. + axis : an int + Dimension along which to concatenate. + name : a string or None + An optional name to attach to this layer. 
+ """ + def __init__( + self, + layer = [], + axis = 0, + name ='stack', + ): + Layer.__init__(self, name=name) + self.inputs = [] + for l in layer: + self.inputs.append(l.outputs) + + self.outputs = tf.stack(self.inputs, axis=axis, name=name) + + print(" [TL] StackLayer %s: axis: %d" % (self.name, axis)) + + self.all_layers = list(layer[0].all_layers) + self.all_params = list(layer[0].all_params) + self.all_drop = dict(layer[0].all_drop) + + for i in range(1, len(layer)): + self.all_layers.extend(list(layer[i].all_layers)) + self.all_params.extend(list(layer[i].all_params)) + self.all_drop.update(dict(layer[i].all_drop)) + + self.all_layers = list_remove_repeat(self.all_layers) + self.all_params = list_remove_repeat(self.all_params) + +def UnStackLayer( + layer = None, + num = None, + axis = 0, + name ='unstack',): + """ + The :class:`UnStackLayer` is layer for unstacking the given dimension of a rank-R tensor into rank-(R-1) tensors., see `tf.unstack() `_. + + Parameters + ---------- + layer : a list of :class:`Layer` instances + The `Layer` class feeding into this layer. + num : an int + The length of the dimension axis. Automatically inferred if None (the default). + axis : an int + Dimension along which to concatenate. + name : a string or None + An optional name to attach to this layer. + + Returns + -------- + The list of layer objects unstacked from the input. + """ + inputs = layer.outputs + with tf.variable_scope(name) as vs: + outputs = tf.unstack(inputs, num=num, axis=axis) + + print(" [TL] UnStackLayer %s: num: %s axis: %d, n_outputs: %d" % (name, num, axis, len(outputs))) + + net_new = [] + scope_name = tf.get_variable_scope().name + if scope_name: + whole_name = scope_name + '/' + name + else: + whole_name = name + + for i in range(len(outputs)): + n = Layer(None, name=whole_name+str(i)) + n.outputs = outputs[i] + n.all_layers = list(layer.all_layers) + n.all_params = list(layer.all_params) + n.all_drop = dict(layer.all_drop) + n.all_layers.extend( [inputs] ) + + net_new.append(n) + + return net_new + ## TF-Slim layer class SlimNetsLayer(Layer): """ The :class:`SlimNetsLayer` class can be used to merge all TF-Slim nets into - TensorLayer. Model can be found in `slim-model `_ , more about slim - see `slim-git `_ . + TensorLayer. Models can be found in `slim-model `_, + see Inception V3 example on `Github `_. + Parameters ---------- @@ -5283,10 +6366,6 @@ class SlimNetsLayer(Layer): name : a string or None An optional name to attach to this layer. - Examples - -------- - - see Inception V3 example on `Github `_ - Notes ----- The due to TF-Slim stores the layers as dictionary, the ``all_layers`` in this @@ -5334,7 +6413,8 @@ def __init__( class KerasLayer(Layer): """ The :class:`KerasLayer` class can be used to merge all Keras layers into - TensorLayer. Example can be found here `tutorial_keras.py `_ + TensorLayer. Example can be found here `tutorial_keras.py `_. + This layer will be deprecated soon as :class:`LambdaLayer` can do the same thing. Parameters ---------- @@ -5372,7 +6452,8 @@ def __init__( class EstimatorLayer(Layer): """ The :class:`EstimatorLayer` class accepts ``model_fn`` that described the model. - It is similar with :class:`KerasLayer`, see `tutorial_keras.py `_ + It is similar with :class:`KerasLayer`, see `tutorial_keras.py `_. + This layer will be deprecated soon as :class:`LambdaLayer` can do the same thing. 
Parameters ---------- @@ -5445,7 +6526,7 @@ def __init__( # with tf.name_scope(name) as scope: with tf.variable_scope(name) as vs: - alphas = tf.get_variable(name='alphas', shape=w_shape, initializer=a_init, **a_init_args ) + alphas = tf.get_variable(name='alphas', shape=w_shape, initializer=a_init, dtype=D_TYPE, **a_init_args ) try: ## TF 1.0 self.outputs = tf.nn.relu(self.inputs) + tf.multiply(alphas, (self.inputs - tf.abs(self.inputs))) * 0.5 except: ## TF 0.12 @@ -5556,7 +6637,7 @@ def __init__(self, # ---------- # layer : a list of :class:`Layer` instances # The `Layer` class feeding into this layer. -# n_outputs : a int +# n_outputs : an int # The number of output # name : a string or None # An optional name to attach to this layer. @@ -5653,9 +6734,9 @@ def __init__(self, softmax_loss_function = None # Sampled softmax only makes sense if we sample less than vocabulary size. if num_samples > 0 and num_samples < self.target_vocab_size: - w = tf.get_variable("proj_w", [size, self.target_vocab_size]) + w = tf.get_variable("proj_w", [size, self.target_vocab_size], dtype=D_TYPE) w_t = tf.transpose(w) - b = tf.get_variable("proj_b", [self.target_vocab_size]) + b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=D_TYPE) output_projection = (w, b) def sampled_loss(inputs, labels): @@ -5680,9 +6761,9 @@ def sampled_loss(inputs, labels): cell = cell_creator() if num_layers > 1: try: # TF1.0 - cell = tf.contrib.rnn.MultiRNNCell([single_cell] * num_layers) + cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers) except: - cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers) + cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers) # ============== Seq Decode Layer ============ # The seq2seq function: we use embedding for the input and attention. @@ -5895,42 +6976,43 @@ def get_batch(self, data, bucket_id, PAD_ID=0, GO_ID=1, EOS_ID=2, UNK_ID=3): return batch_encoder_inputs, batch_decoder_inputs, batch_weights ## Developing or Untested -class MaxoutLayer(Layer): - """ - Waiting for contribution - - Single DenseLayer with Max-out behaviour, work well with Dropout. - - References - ----------- - `Goodfellow (2013) Maxout Networks `_ - """ - def __init__( - self, - layer = None, - n_units = 100, - name ='maxout_layer', - ): - Layer.__init__(self, name=name) - self.inputs = layer.outputs - - print(" [TL] MaxoutLayer %s: %d" % (self.name, self.n_units)) - print(" Waiting for contribution") - with tf.variable_scope(name) as vs: - pass - # W = tf.Variable(init.xavier_init(n_inputs=n_in, n_outputs=n_units, uniform=True), name='W') - # b = tf.Variable(tf.zeros([n_units]), name='b') - - # self.outputs = act(tf.matmul(self.inputs, W) + b) - # https://www.tensorflow.org/versions/r0.9/api_docs/python/array_ops.html#pack - # http://stackoverflow.com/questions/34362193/how-to-explicitly-broadcast-a-tensor-to-match-anothers-shape-in-tensorflow - # tf.concat tf.pack tf.tile +# class MaxoutLayer(Layer): +# """ +# Waiting for contribution +# +# Single DenseLayer with Max-out behaviour, work well with Dropout. 
+# +# References +# ----------- +# `Goodfellow (2013) Maxout Networks `_ +# """ +# def __init__( +# self, +# layer = None, +# n_units = 100, +# name ='maxout_layer', +# ): +# Layer.__init__(self, name=name) +# self.inputs = layer.outputs +# +# print(" [TL] MaxoutLayer %s: %d" % (self.name, self.n_units)) +# print(" Waiting for contribution") +# with tf.variable_scope(name) as vs: +# pass +# # W = tf.Variable(init.xavier_init(n_inputs=n_in, n_outputs=n_units, uniform=True), name='W') +# # b = tf.Variable(tf.zeros([n_units]), name='b') +# +# # self.outputs = act(tf.matmul(self.inputs, W) + b) +# # https://www.tensorflow.org/versions/r0.9/api_docs/python/array_ops.html#pack +# # http://stackoverflow.com/questions/34362193/how-to-explicitly-broadcast-a-tensor-to-match-anothers-shape-in-tensorflow +# # tf.concat tf.pack tf.tile +# +# self.all_layers = list(layer.all_layers) +# self.all_params = list(layer.all_params) +# self.all_drop = dict(layer.all_drop) +# self.all_layers.extend( [self.outputs] ) +# self.all_params.extend( [W, b] ) - self.all_layers = list(layer.all_layers) - self.all_params = list(layer.all_params) - self.all_drop = dict(layer.all_drop) - self.all_layers.extend( [self.outputs] ) - self.all_params.extend( [W, b] ) diff --git a/tensorlayer/nlp.py b/tensorlayer/nlp.py index 13b1e55f..fe1273bd 100644 --- a/tensorlayer/nlp.py +++ b/tensorlayer/nlp.py @@ -1,11 +1,9 @@ #! /usr/bin/python -# -*- coding: utf8 -*- - - - +# -*- coding: utf-8 -*- import tensorflow as tf import os +import re from sys import platform as _platform import collections import random @@ -15,7 +13,9 @@ from tensorflow.python.platform import gfile import re -## Iteration functions +# Iteration functions + + def generate_skip_gram_batch(data, batch_size, num_skips, skip_window, data_index=0): """Generate a training batch for the Skip-Gram model. @@ -44,9 +44,8 @@ def generate_skip_gram_batch(data, batch_size, num_skips, skip_window, data_inde Examples -------- - >>> Setting num_skips=2, skip_window=1, use the right and left words. - >>> In the same way, num_skips=4, skip_window=2 means use the nearby 4 words. - + - Setting num_skips=2, skip_window=1, use the right and left words. + In the same way, num_skips=4, skip_window=2 means use the nearby 4 words. >>> data = [1,2,3,4,5,6,7,8,9,10,11] >>> batch, labels, data_index = tl.nlp.generate_skip_gram_batch(data=data, batch_size=8, num_skips=2, skip_window=1, data_index=0) >>> print(batch) @@ -72,14 +71,14 @@ def generate_skip_gram_batch(data, batch_size, num_skips, skip_window, data_inde assert num_skips <= 2 * skip_window batch = np.ndarray(shape=(batch_size), dtype=np.int32) labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) - span = 2 * skip_window + 1 # [ skip_window target skip_window ] + span = 2 * skip_window + 1 # [ skip_window target skip_window ] buffer = collections.deque(maxlen=span) for _ in range(span): buffer.append(data[data_index]) data_index = (data_index + 1) % len(data) for i in range(batch_size // num_skips): target = skip_window # target label at the center of the buffer - targets_to_avoid = [ skip_window ] + targets_to_avoid = [skip_window] for j in range(num_skips): while target in targets_to_avoid: target = random.randint(0, span - 1) @@ -91,7 +90,7 @@ def generate_skip_gram_batch(data, batch_size, num_skips, skip_window, data_inde return batch, labels, data_index -## Sampling functions +# Sampling functions def sample(a=[], temperature=1.0): """Sample an index from a probability array. 
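The temperature behavior that the Notes below describe can be sketched in NumPy: rescale the log-probabilities by the temperature and renormalize, so the distribution always sums to one before the multinomial draw (a hedged illustration, not the library implementation).

# Hedged NumPy sketch of temperature sampling as tl.nlp.sample describes it.
import numpy as np

def sample_with_temperature(p, temperature=1.0):
    p = np.asarray(p, dtype=np.float64)
    logits = np.log(p) / temperature
    p = np.exp(logits) / np.sum(np.exp(logits))   # renormalize: sums to one
    return np.argmax(np.random.multinomial(1, p, 1))

print(sample_with_temperature([0.1, 0.2, 0.7], temperature=0.5))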
@@ -109,10 +108,9 @@ def sample(a=[], temperature=1.0): Notes ------ - No matter what is the temperature and input list, the sum of all probabilities will be one. + - No matter what is the temperature and input list, the sum of all probabilities will be one. Even if input list = [1, 100, 200], the sum of all probabilities will still be one. - - For large vocabulary_size, choice a higher temperature to avoid error. + - For large vocabulary_size, choice a higher temperature to avoid error. """ b = np.copy(a) try: @@ -138,6 +136,7 @@ def sample(a=[], temperature=1.0): # print(b) return np.argmax(np.random.multinomial(1, b, 1)) + def sample_top(a=[], top_k=10): """Sample from ``top_k`` probabilities. @@ -154,7 +153,7 @@ def sample_top(a=[], top_k=10): probs = probs / np.sum(probs) choice = np.random.choice(idx, p=probs) return choice - ## old implementation + # old implementation # a = np.array(a) # idx = np.argsort(a)[::-1] # idx = idx[:top_k] @@ -166,117 +165,125 @@ def sample_top(a=[], top_k=10): # # return choice -## Vector representations of words (Advanced) UNDOCUMENT +# Vector representations of words (Advanced) UNDOCUMENT class SimpleVocabulary(object): - """Simple vocabulary wrapper, see create_vocab(). + """Simple vocabulary wrapper, see create_vocab(). - Parameters - ------------ - vocab : A dictionary of word to word_id. - unk_id : Id of the special 'unknown' word. - """ + Parameters + ------------ + vocab : A dictionary of word to word_id. + unk_id : Id of the special 'unknown' word. + """ - def __init__(self, vocab, unk_id): - """Initializes the vocabulary.""" + def __init__(self, vocab, unk_id): + """Initializes the vocabulary.""" + self._vocab = vocab + self._unk_id = unk_id - self._vocab = vocab - self._unk_id = unk_id + def word_to_id(self, word): + """Returns the integer id of a word string.""" + if word in self._vocab: + return self._vocab[word] + else: + return self._unk_id - def word_to_id(self, word): - """Returns the integer id of a word string.""" - if word in self._vocab: - return self._vocab[word] - else: - return self._unk_id class Vocabulary(object): - """Create Vocabulary class from a given vocabulary and its id-word, word-id convert, - see create_vocab() and ``tutorial_tfrecord3.py``. - - Parameters - ----------- - vocab_file : File containing the vocabulary, where the words are the first - whitespace-separated token on each line (other tokens are ignored) and - the word ids are the corresponding line numbers. - start_word : Special word denoting sentence start. - end_word : Special word denoting sentence end. - unk_word : Special word denoting unknown words. - - Properties - ------------ - vocab : a dictionary from word to id. - reverse_vocab : a list from id to word. - start_id : int of start id - end_id : int of end id - unk_id : int of unk id - pad_id : int of padding id - - Vocab_files - ------------- - >>> Look as follow, includes `start_word` , `end_word` but no `unk_word` . - >>> a 969108 - >>> 586368 - >>> 586368 - >>> . 
440479 - >>> on 213612 - >>> of 202290 - >>> the 196219 - >>> in 182598 - >>> with 152984 - >>> and 139109 - >>> is 97322 - """ - - def __init__(self, - vocab_file, - start_word="", - end_word="", - unk_word="", - pad_word=""): - if not tf.gfile.Exists(vocab_file): - tf.logging.fatal("Vocab file %s not found.", vocab_file) - tf.logging.info("Initializing vocabulary from file: %s", vocab_file) - - with tf.gfile.GFile(vocab_file, mode="r") as f: - reverse_vocab = list(f.readlines()) - reverse_vocab = [line.split()[0] for line in reverse_vocab] - assert start_word in reverse_vocab - assert end_word in reverse_vocab - if unk_word not in reverse_vocab: - reverse_vocab.append(unk_word) - vocab = dict([(x, y) for (y, x) in enumerate(reverse_vocab)]) - - print(" [TL] Vocabulary from %s : %s %s %s" % (vocab_file, start_word, end_word, unk_word)) - print(" vocabulary with %d words (includes start_word, end_word, unk_word)" % len(vocab)) - # tf.logging.info(" vocabulary with %d words" % len(vocab)) - - self.vocab = vocab # vocab[word] = id - self.reverse_vocab = reverse_vocab # reverse_vocab[id] = word - - # Save special word ids. - self.start_id = vocab[start_word] - self.end_id = vocab[end_word] - self.unk_id = vocab[unk_word] - self.pad_id = vocab[pad_word] - print(" start_id: %d" % self.start_id) - print(" end_id: %d" % self.end_id) - print(" unk_id: %d" % self.unk_id) - print(" pad_id: %d" % self.pad_id) - - def word_to_id(self, word): - """Returns the integer word id of a word string.""" - if word in self.vocab: - return self.vocab[word] - else: - return self.unk_id + """Create Vocabulary class from a given vocabulary and its id-word, word-id convert, + see create_vocab() and ``tutorial_tfrecord3.py``. + + Parameters + ----------- + vocab_file : File containing the vocabulary, where the words are the first + whitespace-separated token on each line (other tokens are ignored) and + the word ids are the corresponding line numbers. + start_word : Special word denoting sentence start. + end_word : Special word denoting sentence end. + unk_word : Special word denoting unknown words. + + Attributes + ------------ + vocab : a dictionary from word to id. + reverse_vocab : a list from id to word. + start_id : int of start id + end_id : int of end id + unk_id : int of unk id + pad_id : int of padding id + + Vocab_files + ------------- + >>> Look as follow, includes `start_word` , `end_word` but no `unk_word` . + >>> a 969108 + >>> 586368 + >>> 586368 + >>> . 
440479 + >>> on 213612 + >>> of 202290 + >>> the 196219 + >>> in 182598 + >>> with 152984 + >>> and 139109 + >>> is 97322 + """ + + def __init__(self, + vocab_file, + start_word="", + end_word="", + unk_word="", + pad_word=""): + if not tf.gfile.Exists(vocab_file): + tf.logging.fatal("Vocab file %s not found.", vocab_file) + tf.logging.info("Initializing vocabulary from file: %s", vocab_file) + + with tf.gfile.GFile(vocab_file, mode="r") as f: + reverse_vocab = list(f.readlines()) + reverse_vocab = [line.split()[0] for line in reverse_vocab] + # assert start_word in reverse_vocab + # assert end_word in reverse_vocab + if start_word not in reverse_vocab: # haodong + reverse_vocab.append(start_word) + if end_word not in reverse_vocab: + reverse_vocab.append(end_word) + if unk_word not in reverse_vocab: + reverse_vocab.append(unk_word) + if pad_word not in reverse_vocab: + reverse_vocab.append(pad_word) + + vocab = dict([(x, y) for (y, x) in enumerate(reverse_vocab)]) + + print(" [TL] Vocabulary from %s : %s %s %s" % (vocab_file, start_word, end_word, unk_word)) + print(" vocabulary with %d words (includes start_word, end_word, unk_word)" % len(vocab)) + # tf.logging.info(" vocabulary with %d words" % len(vocab)) + + self.vocab = vocab # vocab[word] = id + self.reverse_vocab = reverse_vocab # reverse_vocab[id] = word + + # Save special word ids. + self.start_id = vocab[start_word] + self.end_id = vocab[end_word] + self.unk_id = vocab[unk_word] + self.pad_id = vocab[pad_word] + print(" start_id: %d" % self.start_id) + print(" end_id: %d" % self.end_id) + print(" unk_id: %d" % self.unk_id) + print(" pad_id: %d" % self.pad_id) + + def word_to_id(self, word): + """Returns the integer word id of a word string.""" + if word in self.vocab: + return self.vocab[word] + else: + return self.unk_id + + def id_to_word(self, word_id): + """Returns the word string of an integer word id.""" + if word_id >= len(self.reverse_vocab): + return self.reverse_vocab[self.unk_id] + else: + return self.reverse_vocab[word_id] - def id_to_word(self, word_id): - """Returns the word string of an integer word id.""" - if word_id >= len(self.reverse_vocab): - return self.reverse_vocab[self.unk_id] - else: - return self.reverse_vocab[word_id] def process_sentence(sentence, start_word="", end_word=""): """Converts a sentence string into a list of string words, add start_word and end_word, @@ -318,6 +325,7 @@ def process_sentence(sentence, start_word="", end_word=""): process_sentence.append(end_word) return process_sentence + def create_vocab(sentences, word_counts_output_file, min_word_count=1): """Creates the vocabulary of word to word_id, see create_vocab() and ``tutorial_tfrecord3.py``. @@ -336,9 +344,9 @@ def create_vocab(sentences, word_counts_output_file, min_word_count=1): -------- - tl.nlp.SimpleVocabulary object. - Mores - ----- - - ``tl.nlp.build_vocab()`` + Notes + ------- + - See more ``tl.nlp.build_vocab()`` Examples -------- @@ -375,7 +383,7 @@ def create_vocab(sentences, word_counts_output_file, min_word_count=1): # Filter uncommon words and sort by descending count. 
word_counts = [x for x in counter.items() if x[1] >= min_word_count] word_counts.sort(key=lambda x: x[1], reverse=True) - word_counts = [("", 0)] + word_counts # 1st id should be reserved for padding + word_counts = [("", 0)] + word_counts # 1st id should be reserved for padding # print(word_counts) print(" Words in vocabulary: %d" % len(word_counts)) @@ -393,7 +401,7 @@ def create_vocab(sentences, word_counts_output_file, min_word_count=1): return vocab -## Vector representations of words +# Vector representations of words def simple_read_words(filename="nietzsche.txt"): """Read context from file without any preprocessing. @@ -406,39 +414,41 @@ def simple_read_words(filename="nietzsche.txt"): -------- The context in a string """ - with open("nietzsche.txt", "r") as f: + with open(filename, "r") as f: words = f.read() return words -def read_words(filename="nietzsche.txt", replace = ['\n', '']): - """File to list format context. Note that, this script can not handle punctuations. + +def read_words(filename="nietzsche.txt", replace=['\n', '']): + """ File to list format context. Note that, this script can not handle punctuations. For customized read_words method, see ``tutorial_generate_text.py``. Parameters - ---------- + ----------- filename : a string - A file path (like .txt file), + A file path (like .txt file) replace : a list [original string, target string], to disable replace use ['', ''] Returns -------- - The context in a list, split by space by default, and use ``''`` to represent ``'\n'``, + The context in a list, split by space by default, and use ```` to represent ``\\n``, e.g. ``[... 'how', 'useful', 'it', "'s" ... ]``. - Code References + References --------------- - `tensorflow.models.rnn.ptb.reader `_ """ with tf.gfile.GFile(filename, "r") as f: try: # python 3.4 or older context_list = f.read().replace(*replace).split() - except: # python 3.5 + except: # python 3.5 f.seek(0) replace = [x.encode('utf-8') for x in replace] context_list = f.read().replace(*replace).split() return context_list + def read_analogies_file(eval_file='questions-words.txt', word2id={}): """Reads through an analogy question file, return its id format. @@ -487,21 +497,22 @@ def read_analogies_file(eval_file='questions-words.txt', word2id={}): questions = [] questions_skipped = 0 with open(eval_file, "rb") as analogy_f: - for line in analogy_f: - if line.startswith(b":"): # Skip comments. + for line in analogy_f: + if line.startswith(b":"): # Skip comments. continue - words = line.strip().lower().split(b" ") # lowercase - ids = [word2id.get(w.strip()) for w in words] - if None in ids or len(ids) != 4: - questions_skipped += 1 - else: - questions.append(np.array(ids)) + words = line.strip().lower().split(b" ") # lowercase + ids = [word2id.get(w.strip()) for w in words] + if None in ids or len(ids) != 4: + questions_skipped += 1 + else: + questions.append(np.array(ids)) print("Eval analogy file: ", eval_file) print("Questions: ", len(questions)) print("Skipped: ", questions_skipped) analogy_questions = np.array(questions, dtype=np.int32) return analogy_questions + def build_vocab(data): """Build vocabulary. Given the context in list format. @@ -518,7 +529,7 @@ def build_vocab(data): word_to_id : a dictionary mapping words to unique IDs. e.g. {'campbell': 2587, 'atlantic': 2247, 'aoun': 6746 .... } - Code References + References --------------- - `tensorflow.models.rnn.ptb.reader `_ @@ -539,6 +550,7 @@ def build_vocab(data): # print(word_to_id) # dictionary for word to id, e.g. 
'campbell': 2587, 'atlantic': 2247, 'aoun': 6746 return word_to_id + def build_reverse_dictionary(word_to_id): """Given a dictionary for converting word to integer id. Returns a reverse dictionary for converting a id to word. @@ -556,7 +568,8 @@ def build_reverse_dictionary(word_to_id): reverse_dictionary = dict(zip(word_to_id.values(), word_to_id.keys())) return reverse_dictionary -def build_words_dataset(words=[], vocabulary_size=50000, printable=True, unk_key = 'UNK'): + +def build_words_dataset(words=[], vocabulary_size=50000, printable=True, unk_key='UNK'): """Build the words dictionary and replace rare words with 'UNK' token. The most common word has the smallest integer id. @@ -592,7 +605,7 @@ def build_words_dataset(words=[], vocabulary_size=50000, printable=True, unk_key >>> vocabulary_size = 50000 >>> data, count, dictionary, reverse_dictionary = tl.nlp.build_words_dataset(words, vocabulary_size) - Code References + References ----------------- - `tensorflow/examples/tutorials/word2vec/word2vec_basic.py `_ """ @@ -616,11 +629,12 @@ def build_words_dataset(words=[], vocabulary_size=50000, printable=True, unk_key if printable: print('Real vocabulary size %d' % len(collections.Counter(words).keys())) print('Limited vocabulary size {}'.format(vocabulary_size)) - assert len(collections.Counter(words).keys()) >= vocabulary_size , \ - "the limited vocabulary_size must be less than or equal to the read vocabulary_size" + assert len(collections.Counter(words).keys()) >= vocabulary_size, \ + "the limited vocabulary_size must be less than or equal to the read vocabulary_size" return data, count, dictionary, reverse_dictionary -def words_to_word_ids(data=[], word_to_id={}, unk_key = 'UNK'): + +def words_to_word_ids(data=[], word_to_id={}, unk_key='UNK'): """Given a context (words) in list format and the vocabulary, Returns a list of IDs to represent the context. @@ -651,7 +665,7 @@ def words_to_word_ids(data=[], word_to_id={}, unk_key = 'UNK'): >>> print(context) ... [b'hello', b'how', b'are', b'you'] - Code References + References --------------- - `tensorflow.models.rnn.ptb.reader `_ """ @@ -680,6 +694,7 @@ def words_to_word_ids(data=[], word_to_id={}, unk_key = 'UNK'): # # print(data[0]) # return [word_to_id[str(word)] f + def word_ids_to_words(data, id_to_word): """Given a context (ids) in list format and the vocabulary, Returns a list of words to represent the context. @@ -701,6 +716,7 @@ def word_ids_to_words(data, id_to_word): """ return [id_to_word[i] for i in data] + def save_vocab(count=[], name='vocab.txt'): """Save the vocabulary to a file so the model can be reloaded. @@ -735,209 +751,296 @@ def save_vocab(count=[], name='vocab.txt'): f.write("%s %d\n" % (tf.compat.as_text(count[i][0]), count[i][1])) print("%d vocab saved to %s in %s" % (vocabulary_size, name, pwd)) -## Functions for translation +# Functions for translation + + def basic_tokenizer(sentence, _WORD_SPLIT=re.compile(b"([.,!?\"':;)(])")): - """Very basic tokenizer: split the sentence into a list of tokens. - - Parameters - ----------- - sentence : tensorflow.python.platform.gfile.GFile Object - _WORD_SPLIT : regular expression for word spliting. - - - Examples - -------- - >>> see create_vocabulary - >>> from tensorflow.python.platform import gfile - >>> train_path = "wmt/giga-fren.release2" - >>> with gfile.GFile(train_path + ".en", mode="rb") as f: - >>> for line in f: - >>> tokens = tl.nlp.basic_tokenizer(line) - >>> print(tokens) - >>> exit() - ... 
[b'Changing', b'Lives', b'|', b'Changing', b'Society', b'|', b'How', - ... b'It', b'Works', b'|', b'Technology', b'Drives', b'Change', b'Home', - ... b'|', b'Concepts', b'|', b'Teachers', b'|', b'Search', b'|', b'Overview', - ... b'|', b'Credits', b'|', b'HHCC', b'Web', b'|', b'Reference', b'|', - ... b'Feedback', b'Virtual', b'Museum', b'of', b'Canada', b'Home', b'Page'] - - References - ---------- - - Code from ``/tensorflow/models/rnn/translation/data_utils.py`` - """ - words = [] - sentence = tf.compat.as_bytes(sentence) - for space_separated_fragment in sentence.strip().split(): - words.extend(re.split(_WORD_SPLIT, space_separated_fragment)) - return [w for w in words if w] + """Very basic tokenizer: split the sentence into a list of tokens. + + Parameters + ----------- + sentence : tensorflow.python.platform.gfile.GFile Object + _WORD_SPLIT : regular expression for word spliting. + + + Examples + -------- + >>> see create_vocabulary + >>> from tensorflow.python.platform import gfile + >>> train_path = "wmt/giga-fren.release2" + >>> with gfile.GFile(train_path + ".en", mode="rb") as f: + >>> for line in f: + >>> tokens = tl.nlp.basic_tokenizer(line) + >>> print(tokens) + >>> exit() + ... [b'Changing', b'Lives', b'|', b'Changing', b'Society', b'|', b'How', + ... b'It', b'Works', b'|', b'Technology', b'Drives', b'Change', b'Home', + ... b'|', b'Concepts', b'|', b'Teachers', b'|', b'Search', b'|', b'Overview', + ... b'|', b'Credits', b'|', b'HHCC', b'Web', b'|', b'Reference', b'|', + ... b'Feedback', b'Virtual', b'Museum', b'of', b'Canada', b'Home', b'Page'] + + References + ---------- + - Code from ``/tensorflow/models/rnn/translation/data_utils.py`` + """ + words = [] + sentence = tf.compat.as_bytes(sentence) + for space_separated_fragment in sentence.strip().split(): + words.extend(re.split(_WORD_SPLIT, space_separated_fragment)) + return [w for w in words if w] + def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, tokenizer=None, normalize_digits=True, _DIGIT_RE=re.compile(br"\d"), _START_VOCAB=[b"_PAD", b"_GO", b"_EOS", b"_UNK"]): - """Create vocabulary file (if it does not exist yet) from data file. - - Data file is assumed to contain one sentence per line. Each sentence is - tokenized and digits are normalized (if normalize_digits is set). - Vocabulary contains the most-frequent tokens up to max_vocabulary_size. - We write it to vocabulary_path in a one-token-per-line format, so that later - token in the first line gets id=0, second line gets id=1, and so on. - - Parameters - ----------- - vocabulary_path : path where the vocabulary will be created. - data_path : data file that will be used to create vocabulary. - max_vocabulary_size : limit on the size of the created vocabulary. - tokenizer : a function to use to tokenize each data sentence. - if None, basic_tokenizer will be used. - normalize_digits : Boolean - if true, all digits are replaced by 0s. 
- - References - ---------- - - Code from ``/tensorflow/models/rnn/translation/data_utils.py`` - """ - if not gfile.Exists(vocabulary_path): - print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) - vocab = {} - with gfile.GFile(data_path, mode="rb") as f: - counter = 0 - for line in f: - counter += 1 - if counter % 100000 == 0: - print(" processing line %d" % counter) - tokens = tokenizer(line) if tokenizer else basic_tokenizer(line) - for w in tokens: - word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w - if word in vocab: - vocab[word] += 1 - else: - vocab[word] = 1 - vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) - if len(vocab_list) > max_vocabulary_size: - vocab_list = vocab_list[:max_vocabulary_size] - with gfile.GFile(vocabulary_path, mode="wb") as vocab_file: - for w in vocab_list: - vocab_file.write(w + b"\n") - else: - print("Vocabulary %s from data %s exists" % (vocabulary_path, data_path)) + """Create vocabulary file (if it does not exist yet) from data file. + + Data file is assumed to contain one sentence per line. Each sentence is + tokenized and digits are normalized (if normalize_digits is set). + Vocabulary contains the most-frequent tokens up to max_vocabulary_size. + We write it to vocabulary_path in a one-token-per-line format, so that later + token in the first line gets id=0, second line gets id=1, and so on. + + Parameters + ----------- + vocabulary_path : path where the vocabulary will be created. + data_path : data file that will be used to create vocabulary. + max_vocabulary_size : limit on the size of the created vocabulary. + tokenizer : a function to use to tokenize each data sentence. + if None, basic_tokenizer will be used. + normalize_digits : Boolean + if true, all digits are replaced by 0s. + + References + ---------- + - Code from ``/tensorflow/models/rnn/translation/data_utils.py`` + """ + if not gfile.Exists(vocabulary_path): + print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) + vocab = {} + with gfile.GFile(data_path, mode="rb") as f: + counter = 0 + for line in f: + counter += 1 + if counter % 100000 == 0: + print(" processing line %d" % counter) + tokens = tokenizer(line) if tokenizer else basic_tokenizer(line) + for w in tokens: + word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w + if word in vocab: + vocab[word] += 1 + else: + vocab[word] = 1 + vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) + if len(vocab_list) > max_vocabulary_size: + vocab_list = vocab_list[:max_vocabulary_size] + with gfile.GFile(vocabulary_path, mode="wb") as vocab_file: + for w in vocab_list: + vocab_file.write(w + b"\n") + else: + print("Vocabulary %s from data %s exists" % (vocabulary_path, data_path)) + def initialize_vocabulary(vocabulary_path): - """Initialize vocabulary from file, return the word_to_id (dictionary) - and id_to_word (list). - - We assume the vocabulary is stored one-item-per-line, so a file:\n - dog\n - cat\n - will result in a vocabulary {"dog": 0, "cat": 1}, and this function will - also return the reversed-vocabulary ["dog", "cat"]. - - Parameters - ----------- - vocabulary_path : path to the file containing the vocabulary. - - Returns - -------- - vocab : a dictionary - Word to id. A dictionary mapping string to integers. - rev_vocab : a list - Id to word. The reversed vocabulary (a list, which reverses the vocabulary mapping). - - Examples - --------- - >>> Assume 'test' contains - ... dog - ... cat - ... 
bird - >>> vocab, rev_vocab = tl.nlp.initialize_vocabulary("test") - >>> print(vocab) - >>> {b'cat': 1, b'dog': 0, b'bird': 2} - >>> print(rev_vocab) - >>> [b'dog', b'cat', b'bird'] - - Raises - ------- - ValueError : if the provided vocabulary_path does not exist. - """ - if gfile.Exists(vocabulary_path): - rev_vocab = [] - with gfile.GFile(vocabulary_path, mode="rb") as f: - rev_vocab.extend(f.readlines()) - rev_vocab = [tf.compat.as_bytes(line.strip()) for line in rev_vocab] - vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) - return vocab, rev_vocab - else: - raise ValueError("Vocabulary file %s not found.", vocabulary_path) + """Initialize vocabulary from file, return the word_to_id (dictionary) + and id_to_word (list). + + We assume the vocabulary is stored one-item-per-line, so a file:\n + dog\n + cat\n + will result in a vocabulary {"dog": 0, "cat": 1}, and this function will + also return the reversed-vocabulary ["dog", "cat"]. + + Parameters + ----------- + vocabulary_path : path to the file containing the vocabulary. + + Returns + -------- + vocab : a dictionary + Word to id. A dictionary mapping string to integers. + rev_vocab : a list + Id to word. The reversed vocabulary (a list, which reverses the vocabulary mapping). + + Examples + --------- + >>> Assume 'test' contains + ... dog + ... cat + ... bird + >>> vocab, rev_vocab = tl.nlp.initialize_vocabulary("test") + >>> print(vocab) + >>> {b'cat': 1, b'dog': 0, b'bird': 2} + >>> print(rev_vocab) + >>> [b'dog', b'cat', b'bird'] + + Raises + ------- + ValueError : if the provided vocabulary_path does not exist. + """ + if gfile.Exists(vocabulary_path): + rev_vocab = [] + with gfile.GFile(vocabulary_path, mode="rb") as f: + rev_vocab.extend(f.readlines()) + rev_vocab = [tf.compat.as_bytes(line.strip()) for line in rev_vocab] + vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) + return vocab, rev_vocab + else: + raise ValueError("Vocabulary file %s not found.", vocabulary_path) + def sentence_to_token_ids(sentence, vocabulary, tokenizer=None, normalize_digits=True, UNK_ID=3, _DIGIT_RE=re.compile(br"\d")): - """Convert a string to list of integers representing token-ids. - - For example, a sentence "I have a dog" may become tokenized into - ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2, - "a": 4, "dog": 7"} this function will return [1, 2, 4, 7]. - - Parameters - ----------- - sentence : tensorflow.python.platform.gfile.GFile Object - The sentence in bytes format to convert to token-ids.\n - see basic_tokenizer(), data_to_token_ids() - vocabulary : a dictionary mapping tokens to integers. - tokenizer : a function to use to tokenize each sentence; - If None, basic_tokenizer will be used. - normalize_digits : Boolean - If true, all digits are replaced by 0s. - - Returns - -------- - A list of integers, the token-ids for the sentence. - """ - - if tokenizer: - words = tokenizer(sentence) - else: - words = basic_tokenizer(sentence) - if not normalize_digits: - return [vocabulary.get(w, UNK_ID) for w in words] - # Normalize digits by 0 before looking words up in the vocabulary. - return [vocabulary.get(re.sub(_DIGIT_RE, b"0", w), UNK_ID) for w in words] + """Convert a string to list of integers representing token-ids. + + For example, a sentence "I have a dog" may become tokenized into + ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2, + "a": 4, "dog": 7"} this function will return [1, 2, 4, 7]. 
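+
+    (Editor's illustrative run of the example above, with byte-string keys as
+    produced by ``basic_tokenizer``:)
+    >>> vocab = {b"I": 1, b"have": 2, b"a": 4, b"dog": 7}
+    >>> tl.nlp.sentence_to_token_ids(b"I have a dog", vocab)
+    ... [1, 2, 4, 7]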
+ + Parameters + ----------- + sentence : tensorflow.python.platform.gfile.GFile Object + The sentence in bytes format to convert to token-ids.\n + see basic_tokenizer(), data_to_token_ids() + vocabulary : a dictionary mapping tokens to integers. + tokenizer : a function to use to tokenize each sentence; + If None, basic_tokenizer will be used. + normalize_digits : Boolean + If true, all digits are replaced by 0s. + + Returns + -------- + A list of integers, the token-ids for the sentence. + """ + + if tokenizer: + words = tokenizer(sentence) + else: + words = basic_tokenizer(sentence) + if not normalize_digits: + return [vocabulary.get(w, UNK_ID) for w in words] + # Normalize digits by 0 before looking words up in the vocabulary. + return [vocabulary.get(re.sub(_DIGIT_RE, b"0", w), UNK_ID) for w in words] + def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer=None, normalize_digits=True, UNK_ID=3, _DIGIT_RE=re.compile(br"\d")): - """Tokenize data file and turn into token-ids using given vocabulary file. - - This function loads data line-by-line from data_path, calls the above - sentence_to_token_ids, and saves the result to target_path. See comment - for sentence_to_token_ids on the details of token-ids format. - - Parameters - ----------- - data_path : path to the data file in one-sentence-per-line format. - target_path : path where the file with token-ids will be created. - vocabulary_path : path to the vocabulary file. - tokenizer : a function to use to tokenize each sentence; - if None, basic_tokenizer will be used. - normalize_digits : Boolean; if true, all digits are replaced by 0s. - - References - ---------- - - Code from ``/tensorflow/models/rnn/translation/data_utils.py`` - """ - if not gfile.Exists(target_path): - print("Tokenizing data in %s" % data_path) - vocab, _ = initialize_vocabulary(vocabulary_path) - with gfile.GFile(data_path, mode="rb") as data_file: - with gfile.GFile(target_path, mode="w") as tokens_file: - counter = 0 - for line in data_file: - counter += 1 - if counter % 100000 == 0: - print(" tokenizing line %d" % counter) - token_ids = sentence_to_token_ids(line, vocab, tokenizer, - normalize_digits, UNK_ID=UNK_ID, - _DIGIT_RE=_DIGIT_RE) - tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n") - else: - print("Target path %s exists" % target_path) + """Tokenize data file and turn into token-ids using given vocabulary file. + + This function loads data line-by-line from data_path, calls the above + sentence_to_token_ids, and saves the result to target_path. See comment + for sentence_to_token_ids on the details of token-ids format. + + Parameters + ----------- + data_path : path to the data file in one-sentence-per-line format. + target_path : path where the file with token-ids will be created. + vocabulary_path : path to the vocabulary file. + tokenizer : a function to use to tokenize each sentence; + if None, basic_tokenizer will be used. + normalize_digits : Boolean; if true, all digits are replaced by 0s. 
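+
+    Examples
+    ---------
+    (Editor's illustrative call; the file paths are hypothetical.)
+    >>> tl.nlp.data_to_token_ids("train.en", "train.ids.en", "vocab.en")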
+
+    References
+    ----------
+    - Code from ``/tensorflow/models/rnn/translation/data_utils.py``
+    """
+    if not gfile.Exists(target_path):
+        print("Tokenizing data in %s" % data_path)
+        vocab, _ = initialize_vocabulary(vocabulary_path)
+        with gfile.GFile(data_path, mode="rb") as data_file:
+            with gfile.GFile(target_path, mode="w") as tokens_file:
+                counter = 0
+                for line in data_file:
+                    counter += 1
+                    if counter % 100000 == 0:
+                        print("  tokenizing line %d" % counter)
+                    token_ids = sentence_to_token_ids(line, vocab, tokenizer,
+                                                      normalize_digits, UNK_ID=UNK_ID,
+                                                      _DIGIT_RE=_DIGIT_RE)
+                    tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
+    else:
+        print("Target path %s exists" % target_path)
+
+
+# Metric
+import subprocess
+import tempfile
+from six.moves import urllib
+
+
+def moses_multi_bleu(hypotheses, references, lowercase=False):  # tl.nlp
+    """Calculate the BLEU score for hypotheses and references
+    using the MOSES multi-bleu.perl script.
+
+    Parameters
+    ------------
+    hypotheses : A numpy array of strings where each string is a single example.
+    references : A numpy array of strings where each string is a single example.
+    lowercase : If True, pass the "-lc" flag to the multi-bleu script.
+
+    Examples
+    ---------
+    >>> hypotheses = ["a bird is flying on the sky"]
+    >>> references = ["two birds are flying on the sky", "a bird is on the top of the tree", "an airplane is on the sky",]
+    >>> score = tl.nlp.moses_multi_bleu(hypotheses, references)
+
+    Returns
+    --------
+    The BLEU score as a float32 value.
+
+    References
+    ----------
+    - `Google/seq2seq/metric/bleu `_
+    """
+
+    if np.size(hypotheses) == 0:
+        return np.float32(0.0)
+
+    # Get MOSES multi-bleu script
+    try:
+        multi_bleu_path, _ = urllib.request.urlretrieve(
+            "https://raw.githubusercontent.com/moses-smt/mosesdecoder/"
+            "master/scripts/generic/multi-bleu.perl")
+        os.chmod(multi_bleu_path, 0o755)
+    except:  # pylint: disable=W0702
+        tf.logging.info("Unable to fetch multi-bleu.perl script, using local.")
+        metrics_dir = os.path.dirname(os.path.realpath(__file__))
+        bin_dir = os.path.abspath(os.path.join(metrics_dir, "..", "..", "bin"))
+        multi_bleu_path = os.path.join(bin_dir, "tools/multi-bleu.perl")
+
+    # Dump hypotheses and references to tempfiles
+    hypothesis_file = tempfile.NamedTemporaryFile()
+    hypothesis_file.write("\n".join(hypotheses).encode("utf-8"))
+    hypothesis_file.write(b"\n")
+    hypothesis_file.flush()
+    reference_file = tempfile.NamedTemporaryFile()
+    reference_file.write("\n".join(references).encode("utf-8"))
+    reference_file.write(b"\n")
+    reference_file.flush()
+
+    # Calculate BLEU using multi-bleu script
+    with open(hypothesis_file.name, "r") as read_pred:
+        bleu_cmd = [multi_bleu_path]
+        if lowercase:
+            bleu_cmd += ["-lc"]
+        bleu_cmd += [reference_file.name]
+        try:
+            bleu_out = subprocess.check_output(
+                bleu_cmd, stdin=read_pred, stderr=subprocess.STDOUT)
+            bleu_out = bleu_out.decode("utf-8")
+            bleu_score = re.search(r"BLEU = (.+?),", bleu_out).group(1)
+            bleu_score = float(bleu_score)
+        except subprocess.CalledProcessError as error:
+            if error.output is not None:
+                tf.logging.warning("multi-bleu.perl script returned non-zero exit code")
+                tf.logging.warning(error.output)
+            bleu_score = np.float32(0.0)
+
+    # Close temp files
+    hypothesis_file.close()
+    reference_file.close()
+
+    return np.float32(bleu_score)
diff --git a/tensorlayer/ops.py b/tensorlayer/ops.py
index 608799c6..012d1b68 100644
--- a/tensorlayer/ops.py
+++ b/tensorlayer/ops.py
@@ -1,40 +1,76 @@
 #! /usr/bin/python
-# -*- coding: utf8 -*-
+# -*- coding: utf-8 -*-

 import tensorflow as tf
+import tensorlayer as tl
 import os
+import subprocess
 import sys
 from sys import platform as _platform
+from sys import exit as _exit

-def exit_tf(sess=None):
-    """Close tensorboard and nvidia-process if available
+def exit_tf(sess=None, port=6006):
+    """Close TensorFlow session, TensorBoard and Nvidia-process if available.

     Parameters
     ----------
     sess : a session instance of TensorFlow
         TensorFlow session
+    port : an integer
+        TensorBoard port you want to close, 6006 as default.
     """
-    text = "[tl] Close tensorboard and nvidia-process if available"
-    sess.close()
+    text = "[TL] Close tensorboard and nvidia-process if available"
+    text2 = "[TL] Close tensorboard and nvidia-process not yet supported by this function (tl.ops.exit_tf) on "
+    if sess is not None:
+        sess.close()
     # import time
     # time.sleep(2)
     if _platform == "linux" or _platform == "linux2":
         print('linux: %s' % text)
         os.system('nvidia-smi')
-        os.system('fuser 6006/tcp -k')  # kill tensorboard 6006
+        os.system('fuser ' + str(port) + '/tcp -k')  # kill tensorboard on the given port (port is an int, so cast it before concatenating)
         os.system("nvidia-smi | grep python |awk '{print $3}'|xargs kill")  # kill all nvidia-smi python process
+        _exit()
     elif _platform == "darwin":
         print('OS X: %s' % text)
-        os.system("lsof -i tcp:6006 | grep -v PID | awk '{print $2}' | xargs kill")  # kill tensorboard 6006
+        subprocess.Popen("lsof -i tcp:" + str(port) + " | grep -v PID | awk '{print $2}' | xargs kill", shell=True)  # kill tensorboard
     elif _platform == "win32":
-        print('Windows: %s' % text)
+        print(text2 + "Windows")
+        # TODO
     else:
-        print(_platform)
-        exit()
+        print(text2 + _platform)
+
+def open_tb(logdir='/tmp/tensorflow', port=6006):
+    """Open TensorBoard.
+
+    Parameters
+    ----------
+    logdir : a string
+        Directory where your tensorboard logs are saved
+    port : an integer
+        TensorBoard port you want to open, 6006 is the TensorBoard default
+    """
+    text = "[TL] Open tensorboard, go to localhost:" + str(port) + " to access"
+    text2 = " not yet supported by this function (tl.ops.open_tb)"
+
+    if not tl.files.exists_or_mkdir(logdir, verbose=False):
+        print("[TL] Log directory was created at %s" % logdir)
+
+    if _platform == "linux" or _platform == "linux2":
+        print('linux %s' % text2)
+        # TODO
+    elif _platform == "darwin":
+        print('OS X: %s' % text)
+        subprocess.Popen("python -m tensorflow.tensorboard --logdir=" + logdir + " --port=" + str(port), shell=True)  # open tensorboard at localhost:<port> or whatever port you chose
+    elif _platform == "win32":
+        print('Windows%s' % text2)
+        # TODO
+    else:
+        print(_platform + text2)

 def clear_all(printable=True):
     """Clears all the placeholder variables of keep prob,
@@ -92,13 +128,26 @@ def set_gpu_fraction(sess=None, gpu_fraction=0.3):
     ----------
     - `TensorFlow using GPU `_
     """
-    print(" tensorlayer: GPU MEM Fraction %f" % gpu_fraction)
+    print("[TL]: GPU MEM Fraction %f" % gpu_fraction)
     gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
     sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
     return sess

+def setlinebuf():
+    """Set buffer mode to _IOLBF for stdout.
+    When running in a container, or other environments where stdout is redirected,
+    the default buffering can seriously delay messages written by `print`.
+    TODO: this method should be called automatically by default.
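+
+    Examples
+    ---------
+    (Editor's illustrative usage: call once at start-up.)
+    >>> import tensorlayer as tl
+    >>> tl.ops.setlinebuf()  # stdout now flushes after every printed line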
+ + References + ----------- + - ``_ + - ``_ + - `man setlinebuf `_ + """ + sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 1) def disable_print(): @@ -184,10 +233,10 @@ def get_site_packages_directory(): import site try: loc = site.getsitepackages() - print(" tl.ops : site-packages in ", loc) + print("[TL] tl.ops : site-packages in ", loc) return loc except: - print(" tl.ops : Cannot find package dir from virtual environment") + print("[TL] tl.ops : Cannot find package dir from virtual environment") return False @@ -196,7 +245,7 @@ def empty_trash(): """Empty trash folder. """ - text = "[tl] Empty the trash" + text = "[TL] Empty the trash" if _platform == "linux" or _platform == "linux2": print('linux: %s' % text) os.system("rm -rf ~/.local/share/Trash/*") diff --git a/tensorlayer/prepro.py b/tensorlayer/prepro.py index 0503b869..11272a1e 100644 --- a/tensorlayer/prepro.py +++ b/tensorlayer/prepro.py @@ -1,5 +1,5 @@ #! /usr/bin/python -# -*- coding: utf8 -*- +# -*- coding: utf-8 -*- import tensorflow as tf @@ -30,17 +30,20 @@ from skimage import exposure import skimage +from multiprocessing import Pool + # linalg https://docs.scipy.org/doc/scipy/reference/linalg.html # ndimage https://docs.scipy.org/doc/scipy/reference/ndimage.html ## Threading -def threading_data(data=None, fn=None, **kwargs): +def threading_data(data=None, fn=None, thread_count=None, **kwargs): """Return a batch of result by given data. Usually be used for data augmentation. Parameters ----------- - data : numpy array or zip of numpy array, see Examples below. + data : numpy array, file names and etc, see Examples below. + thread_count : the number of threads to use fn : the function for data processing. more args : the args for fn, see Examples below. @@ -62,6 +65,15 @@ def threading_data(data=None, fn=None, **kwargs): >>> tl.visualize.images2d(images=np.asarray(X_), second=0.01, saveable=True, name='after', dtype=None) >>> tl.visualize.images2d(images=np.asarray(Y_), second=0.01, saveable=True, name='before', dtype=None) + - Single array split across ``thread_count`` threads (e.g. functions with ``multi``) + >>> X, Y --> [batch_size, row, col, 1] greyscale + >>> data = threading_data(X, zoom_multi, 8, zoom_range=[0.5, 1], is_random=True) + ... data --> [batch_size, 2, row, col, 1] + >>> X_, Y_ = data.transpose((1,0,2,3,4)) + ... X_, Y_ --> [batch_size, row, col, 1] + >>> tl.visualize.images2d(images=np.asarray(X_), second=0.01, saveable=True, name='after', dtype=None) + >>> tl.visualize.images2d(images=np.asarray(Y_), second=0.01, saveable=True, name='before', dtype=None) + - Customized function for image segmentation >>> def distort_img(data): ... x, y = data @@ -87,58 +99,47 @@ def apply_fn(results, i, data, kwargs): results[i] = fn(data, **kwargs) ## start multi-threaded reading. 
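+    # (Editor's note: with thread_count=None every element of `data` gets its
+    #  own thread and `fn` receives a single sample; with thread_count=k the
+    #  data is split into k contiguous chunks via np.linspace/np.round and
+    #  `fn` receives a whole chunk, as the *_multi functions expect.)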
- results = [None] * len(data) ## preallocate result list - threads = [] - for i in range(len(data)): - t = threading.Thread( - name='threading_and_return', - target=apply_fn, - args=(results, i, data[i], kwargs) - ) - t.start() - threads.append(t) + if thread_count is None: # by Milo + results = [None] * len(data) ## preallocate result list + threads = [] + for i in range(len(data)): + t = threading.Thread( + name='threading_and_return', + target=apply_fn, + args=(results, i, data[i], kwargs) + ) + t.start() + threads.append(t) + else: # by geometrikal + divs = np.linspace(0, len(data), thread_count + 1) + divs = np.round(divs).astype(int) + results = [None] * thread_count + threads = [] + for i in range(thread_count): + t = threading.Thread( + name='threading_and_return', + target=apply_fn, + args=(results, i, data[divs[i]:divs[i + 1]], kwargs) + ) + t.start() + threads.append(t) ## wait for all threads to complete for t in threads: t.join() - return np.asarray(results) - - ## old implementation - # define function for threading - # def function(q, i, data, kwargs): - # result = fn(data, **kwargs) - # q.put([i, result]) - # ## start threading - # q = queue.Queue() - # threads = [] - # for i in range(len(data)): - # t = threading.Thread( - # name='threading_and_return', - # target=function, - # args=(q, i, data[i], kwargs) - # ) - # t.start() - # threads.append(t) - # - # ## wait for all threads to complete - # for t in threads: - # t.join() - # - # ## get results - # results = [] - # for i in range(len(data)): - # result = q.get() - # results.append(result) - # results = sorted(results) - # for i in range(len(results)): - # results[i] = results[i][1] - # return np.asarray(results) + if thread_count is None: + try: + return np.asarray(results) + except: # if dim don't match + return results + else: + return np.concatenate(results) ## Image def rotation(x, rg=20, is_random=False, row_index=0, col_index=1, channel_index=2, - fill_mode='nearest', cval=0.): + fill_mode='nearest', cval=0., order=1): """Rotate an image randomly or non-randomly. Parameters @@ -157,6 +158,8 @@ def rotation(x, rg=20, is_random=False, row_index=0, col_index=1, channel_index= - `scipy ndimage affine_transform `_ cval : scalar, optional Value used for points outside the boundaries of the input if mode='constant'. Default is 0.0 + order : int, optional + The order of interpolation. The order has to be in the range 0-5. See ``apply_transform``. - `scipy ndimage affine_transform `_ @@ -176,11 +179,11 @@ def rotation(x, rg=20, is_random=False, row_index=0, col_index=1, channel_index= h, w = x.shape[row_index], x.shape[col_index] transform_matrix = transform_matrix_offset_center(rotation_matrix, h, w) - x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval) + x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval, order) return x def rotation_multi(x, rg=20, is_random=False, row_index=0, col_index=1, channel_index=2, - fill_mode='nearest', cval=0.): + fill_mode='nearest', cval=0., order=1): """Rotate multiple images with the same arguments, randomly or non-randomly. Usually be used for image segmentation which x=[X, Y], X and Y should be matched. 
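+
+    (Editor's illustrative usage, assuming ``im_x`` and ``im_y`` are a matched
+    image/mask pair:)
+    >>> x, y = tl.prepro.rotation_multi([im_x, im_y], rg=20, is_random=True)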
@@ -209,7 +212,7 @@ def rotation_multi(x, rg=20, is_random=False, row_index=0, col_index=1, channel_ transform_matrix = transform_matrix_offset_center(rotation_matrix, h, w) results = [] for data in x: - results.append( apply_transform(data, transform_matrix, channel_index, fill_mode, cval)) + results.append( apply_transform(data, transform_matrix, channel_index, fill_mode, cval, order)) return np.asarray(results) # crop @@ -220,9 +223,9 @@ def crop(x, wrg, hrg, is_random=False, row_index=0, col_index=1, channel_index=2 ---------- x : numpy array An image with dimension of [row, col, channel] (default). - wrg : float - Size of weight. - hrg : float + wrg : int + Size of width. + hrg : int Size of height. is_random : boolean, default False If True, randomly crop, else central crop. @@ -278,7 +281,7 @@ def crop_multi(x, wrg, hrg, is_random=False, row_index=0, col_index=1, channel_i return np.asarray(results) # flip -def flip_axis(x, axis, is_random=False): +def flip_axis(x, axis=1, is_random=False): """Flip the axis of an image, such as flip left and right, up and down, randomly or non-randomly, Parameters @@ -347,7 +350,7 @@ def flip_axis_multi(x, axis, is_random=False): # shift def shift(x, wrg=0.1, hrg=0.1, is_random=False, row_index=0, col_index=1, channel_index=2, - fill_mode='nearest', cval=0.): + fill_mode='nearest', cval=0., order=1): """Shift an image randomly or non-randomly. Parameters @@ -368,6 +371,8 @@ def shift(x, wrg=0.1, hrg=0.1, is_random=False, row_index=0, col_index=1, channe - `scipy ndimage affine_transform `_ cval : scalar, optional Value used for points outside the boundaries of the input if mode='constant'. Default is 0.0. + order : int, optional + The order of interpolation. The order has to be in the range 0-5. See ``apply_transform``. - `scipy ndimage affine_transform `_ """ @@ -382,11 +387,11 @@ def shift(x, wrg=0.1, hrg=0.1, is_random=False, row_index=0, col_index=1, channe [0, 0, 1]]) transform_matrix = translation_matrix # no need to do offset - x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval) + x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval, order) return x def shift_multi(x, wrg=0.1, hrg=0.1, is_random=False, row_index=0, col_index=1, channel_index=2, - fill_mode='nearest', cval=0.): + fill_mode='nearest', cval=0., order=1): """Shift images with the same arguments, randomly or non-randomly. Usually be used for image segmentation which x=[X, Y], X and Y should be matched. @@ -409,12 +414,12 @@ def shift_multi(x, wrg=0.1, hrg=0.1, is_random=False, row_index=0, col_index=1, transform_matrix = translation_matrix # no need to do offset results = [] for data in x: - results.append( apply_transform(data, transform_matrix, channel_index, fill_mode, cval)) + results.append( apply_transform(data, transform_matrix, channel_index, fill_mode, cval, order)) return np.asarray(results) # shear def shear(x, intensity=0.1, is_random=False, row_index=0, col_index=1, channel_index=2, - fill_mode='nearest', cval=0.): + fill_mode='nearest', cval=0., order=1): """Shear an image randomly or non-randomly. Parameters @@ -434,8 +439,14 @@ def shear(x, intensity=0.1, is_random=False, row_index=0, col_index=1, channel_i - `scipy ndimage affine_transform `_ cval : scalar, optional Value used for points outside the boundaries of the input if mode='constant'. Default is 0.0. + order : int, optional + The order of interpolation. The order has to be in the range 0-5. See ``apply_transform``. 
- `scipy ndimage affine_transform `_ + + References + ----------- + - `Affine transformation `_ """ if is_random: shear = np.random.uniform(-intensity, intensity) @@ -447,11 +458,11 @@ def shear(x, intensity=0.1, is_random=False, row_index=0, col_index=1, channel_i h, w = x.shape[row_index], x.shape[col_index] transform_matrix = transform_matrix_offset_center(shear_matrix, h, w) - x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval) + x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval, order) return x def shear_multi(x, intensity=0.1, is_random=False, row_index=0, col_index=1, channel_index=2, - fill_mode='nearest', cval=0.): + fill_mode='nearest', cval=0., order=1): """Shear images with the same arguments, randomly or non-randomly. Usually be used for image segmentation which x=[X, Y], X and Y should be matched. @@ -459,7 +470,7 @@ def shear_multi(x, intensity=0.1, is_random=False, row_index=0, col_index=1, cha ----------- x : list of numpy array List of images with dimension of [n_images, row, col, channel] (default). - others : see ``shear``. + others : see ``tl.prepro.shear``. """ if is_random: shear = np.random.uniform(-intensity, intensity) @@ -473,7 +484,77 @@ def shear_multi(x, intensity=0.1, is_random=False, row_index=0, col_index=1, cha transform_matrix = transform_matrix_offset_center(shear_matrix, h, w) results = [] for data in x: - results.append( apply_transform(data, transform_matrix, channel_index, fill_mode, cval)) + results.append( apply_transform(data, transform_matrix, channel_index, fill_mode, cval, order)) + return np.asarray(results) + +def shear2(x, shear=(0.1, 0.1), is_random=False, row_index=0, col_index=1, channel_index=2, + fill_mode='nearest', cval=0., order=1): + """Shear an image randomly or non-randomly. + + Parameters + ----------- + x : numpy array + An image with dimension of [row, col, channel] (default). + shear : tuple of two floats + Percentage of shear for height and width direction (0, 1). + is_random : boolean, default False + If True, randomly shear. + row_index, col_index, channel_index : int + Index of row, col and channel, default (0, 1, 2), for theano (1, 2, 0). + fill_mode : string + Method to fill missing pixel, default ‘nearest’, more options ‘constant’, ‘reflect’ or ‘wrap’. + + - `scipy ndimage affine_transform `_ + cval : scalar, optional + Value used for points outside the boundaries of the input if mode='constant'. Default is 0.0. + order : int, optional + The order of interpolation. The order has to be in the range 0-5. See ``apply_transform``. + + - `scipy ndimage affine_transform `_ + + References + ----------- + - `Affine transformation `_ + """ + assert len(shear) == 2, "shear should be tuple of 2 floats, or you want to use tl.prepro.shear rather than tl.prepro.shear2 ?" + if is_random: + shear[0] = np.random.uniform(-shear[0], shear[0]) + shear[1] = np.random.uniform(-shear[1], shear[1]) + + shear_matrix = np.array([[1, shear[0], 0], + [shear[1], 1, 0], + [0, 0, 1]]) + + h, w = x.shape[row_index], x.shape[col_index] + transform_matrix = transform_matrix_offset_center(shear_matrix, h, w) + x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval, order) + return x + +def shear_multi2(x, shear=(0.1, 0.1), is_random=False, row_index=0, col_index=1, channel_index=2, + fill_mode='nearest', cval=0., order=1): + """Shear images with the same arguments, randomly or non-randomly. + Usually be used for image segmentation which x=[X, Y], X and Y should be matched. 
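+
+    (Editor's illustrative usage; a list is passed for ``shear`` so the random
+    branch can mutate it in place:)
+    >>> x, y = tl.prepro.shear_multi2([x, y], shear=[0.1, 0.2], is_random=True)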
+ + Parameters + ----------- + x : list of numpy array + List of images with dimension of [n_images, row, col, channel] (default). + others : see ``tl.prepro.shear2``. + """ + assert len(shear) == 2, "shear should be tuple of 2 floats, or you want to use tl.prepro.shear_multi rather than tl.prepro.shear_multi2 ?" + if is_random: + shear[0] = np.random.uniform(-shear[0], shear[0]) + shear[1] = np.random.uniform(-shear[1], shear[1]) + + shear_matrix = np.array([[1, shear[0], 0], + [shear[1], 1, 0], + [0, 0, 1]]) + + h, w = x[0].shape[row_index], x[0].shape[col_index] + transform_matrix = transform_matrix_offset_center(shear_matrix, h, w) + results = [] + for data in x: + results.append( apply_transform(data, transform_matrix, channel_index, fill_mode, cval, order)) return np.asarray(results) # swirl @@ -661,7 +742,7 @@ def elastic_transform_multi(x, alpha, sigma, mode="constant", cval=0, is_random= # zoom def zoom(x, zoom_range=(0.9, 1.1), is_random=False, row_index=0, col_index=1, channel_index=2, - fill_mode='nearest', cval=0.): + fill_mode='nearest', cval=0., order=1): """Zoom in and out of a single image, randomly or non-randomly. Parameters @@ -670,7 +751,7 @@ def zoom(x, zoom_range=(0.9, 1.1), is_random=False, row_index=0, col_index=1, ch An image with dimension of [row, col, channel] (default). zoom_range : list or tuple - If is_random=False, (h, w) are the fixed zoom factor for row and column axies, factor small than one is zoom in. - - If is_random=True, (min zoom out, max zoom out) for x and y with different random zoom in/out factor. + - If is_random=True, it is (min zoom out, max zoom out) for x and y with different random zoom in/out factor. e.g (0.5, 1) zoom in 1~2 times. is_random : boolean, default False If True, randomly zoom. @@ -682,6 +763,8 @@ def zoom(x, zoom_range=(0.9, 1.1), is_random=False, row_index=0, col_index=1, ch - `scipy ndimage affine_transform `_ cval : scalar, optional Value used for points outside the boundaries of the input if mode='constant'. Default is 0.0. + order : int, optional + The order of interpolation. The order has to be in the range 0-5. See ``apply_transform``. - `scipy ndimage affine_transform `_ """ @@ -703,11 +786,11 @@ def zoom(x, zoom_range=(0.9, 1.1), is_random=False, row_index=0, col_index=1, ch h, w = x.shape[row_index], x.shape[col_index] transform_matrix = transform_matrix_offset_center(zoom_matrix, h, w) - x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval) + x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval, order) return x def zoom_multi(x, zoom_range=(0.9, 1.1), is_random=False, - row_index=0, col_index=1, channel_index=2, fill_mode='nearest', cval=0.): + row_index=0, col_index=1, channel_index=2, fill_mode='nearest', cval=0., order=1): """Zoom in and out of images with the same arguments, randomly or non-randomly. Usually be used for image segmentation which x=[X, Y], X and Y should be matched. @@ -740,7 +823,7 @@ def zoom_multi(x, zoom_range=(0.9, 1.1), is_random=False, # return x results = [] for data in x: - results.append( apply_transform(data, transform_matrix, channel_index, fill_mode, cval)) + results.append( apply_transform(data, transform_matrix, channel_index, fill_mode, cval, order)) return np.asarray(results) # image = tf.image.random_brightness(image, max_delta=32. / 255.) @@ -757,7 +840,7 @@ def brightness(x, gamma=1, gain=1, is_random=False): x : numpy array An image with dimension of [row, col, channel] (default). gamma : float, small than 1 means brighter. 
-        Non negative real number. Default value is 1.
+        Non-negative real number. Default value is 1; smaller means brighter.

         - If is_random is True, gamma in a range of (1-gamma, 1+gamma).
     gain : float
@@ -793,19 +876,185 @@ def brightness_multi(x, gamma=1, gain=1, is_random=False):
         results.append( exposure.adjust_gamma(data, gamma, gain) )
     return np.asarray(results)

+# illumination
+def illumination(x, gamma=1., contrast=1., saturation=1., is_random=False):
+    """Perform illumination augmentation for a single image, randomly or non-randomly.

-# contrast
-def constant(x, cutoff=0.5, gain=10, inv=False, is_random=False):
-    # TODO
-    x = exposure.adjust_sigmoid(x, cutoff=cutoff, gain=gain, inv=inv)
-    return x
+    Parameters
+    -----------
+    x : numpy array
+        An image with dimension of [row, col, channel] (default).
+    gamma : change brightness (the same as ``tl.prepro.brightness``)
+        - if is_random=False, one float number, smaller than one means brighter, greater than one means darker.
+        - if is_random=True, tuple of two float numbers, (min, max).
+    contrast : change contrast
+        - if is_random=False, one float number, smaller than one means lower contrast.
+        - if is_random=True, tuple of two float numbers, (min, max).
+    saturation : change saturation
+        - if is_random=False, one float number, smaller than one means less saturated.
+        - if is_random=True, tuple of two float numbers, (min, max).
+    is_random : whether the parameters are randomly set.
+
+    Examples
+    ---------
+    - Random
+    >>> x = illumination(x, gamma=(0.5, 5.0), contrast=(0.3, 1.0), saturation=(0.7, 1.0), is_random=True)
+    - Non-random
+    >>> x = illumination(x, 0.5, 0.6, 0.8, is_random=False)
+    """
+    from PIL import Image, ImageEnhance
+
+    if is_random:
+        try:
+            assert len(gamma) == len(contrast) == len(saturation) == 2, "if is_random = True, the arguments are (min, max)"
+        except:
+            raise Exception("if is_random = True, the arguments are (min, max)")
+        ## random change brightness # small --> brighter
+        illum_settings = np.random.randint(0, 3)  # 0-brighter, 1-darker, 2-keep normal
+
+        if illum_settings == 0:  # brighter
+            gamma = np.random.uniform(gamma[0], 1.0)  # (.5, 1.0)
+        elif illum_settings == 1:  # darker
+            gamma = np.random.uniform(1.0, gamma[1])  # (1.0, 5.0)
+        else:
+            gamma = 1
+        im_ = brightness(x, gamma=gamma, gain=1, is_random=False)
+
+        # print("using contrast and saturation")
+        image = Image.fromarray(im_)  # array -> PIL
+        contrast_adjust = ImageEnhance.Contrast(image)
+        image = contrast_adjust.enhance(np.random.uniform(contrast[0], contrast[1]))  # e.g. (0.3, 0.9)
+
+        saturation_adjust = ImageEnhance.Color(image)
+        image = saturation_adjust.enhance(np.random.uniform(saturation[0], saturation[1]))  # e.g. (0.7, 1.0)
+        im_ = np.array(image)  # PIL -> array
+    else:
+        im_ = brightness(x, gamma=gamma, gain=1, is_random=False)
+        image = Image.fromarray(im_)  # array -> PIL
+        contrast_adjust = ImageEnhance.Contrast(image)
+        image = contrast_adjust.enhance(contrast)
+
+        saturation_adjust = ImageEnhance.Color(image)
+        image = saturation_adjust.enhance(saturation)
+        im_ = np.array(image)  # PIL -> array
+    return np.asarray(im_)
+
+# hue
+def rgb_to_hsv(rgb):
+    """ Input RGB image [0~255] return HSV image [0~1].
+
+    Parameters
+    -------------
+    rgb : should be a numpy array with values between 0 and 255.
+    """
+    # Translated from source of colorsys.rgb_to_hsv
+    # r,g,b should be numpy arrays with values between 0 and 255
+    # rgb_to_hsv returns an array of floats between 0.0 and 1.0.
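+    # (Editor's note: vectorized analogue of colorsys.rgb_to_hsv -- V is the
+    #  per-pixel channel max, S is (max - min) / max, and H is derived from
+    #  whichever channel holds the max, wrapped into [0, 1).)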
+ rgb = rgb.astype('float') + hsv = np.zeros_like(rgb) + # in case an RGBA array was passed, just copy the A channel + hsv[..., 3:] = rgb[..., 3:] + r, g, b = rgb[..., 0], rgb[..., 1], rgb[..., 2] + maxc = np.max(rgb[..., :3], axis=-1) + minc = np.min(rgb[..., :3], axis=-1) + hsv[..., 2] = maxc + mask = maxc != minc + hsv[mask, 1] = (maxc - minc)[mask] / maxc[mask] + rc = np.zeros_like(r) + gc = np.zeros_like(g) + bc = np.zeros_like(b) + rc[mask] = (maxc - r)[mask] / (maxc - minc)[mask] + gc[mask] = (maxc - g)[mask] / (maxc - minc)[mask] + bc[mask] = (maxc - b)[mask] / (maxc - minc)[mask] + hsv[..., 0] = np.select( + [r == maxc, g == maxc], [bc - gc, 2.0 + rc - bc], default=4.0 + gc - rc) + hsv[..., 0] = (hsv[..., 0] / 6.0) % 1.0 + return hsv + +def hsv_to_rgb(hsv): + """ Input HSV image [0~1] return RGB image [0~255]. + + Parameters + ------------- + hsv : should be a numpy arrays with values between 0.0 and 1.0 + """ + # Translated from source of colorsys.hsv_to_rgb + # h,s should be a numpy arrays with values between 0.0 and 1.0 + # v should be a numpy array with values between 0.0 and 255.0 + # hsv_to_rgb returns an array of uints between 0 and 255. + rgb = np.empty_like(hsv) + rgb[..., 3:] = hsv[..., 3:] + h, s, v = hsv[..., 0], hsv[..., 1], hsv[..., 2] + i = (h * 6.0).astype('uint8') + f = (h * 6.0) - i + p = v * (1.0 - s) + q = v * (1.0 - s * f) + t = v * (1.0 - s * (1.0 - f)) + i = i % 6 + conditions = [s == 0.0, i == 1, i == 2, i == 3, i == 4, i == 5] + rgb[..., 0] = np.select(conditions, [v, q, p, p, t, v], default=v) + rgb[..., 1] = np.select(conditions, [v, v, v, q, p, p], default=t) + rgb[..., 2] = np.select(conditions, [v, p, t, v, v, q], default=p) + return rgb.astype('uint8') + + +def adjust_hue(im, hout=0.66, is_offset=True, is_clip=True, is_random=False): + """ Adjust hue of an RGB image. This is a convenience method that converts an RGB image to float representation, converts it to HSV, add an offset to the hue channel, converts back to RGB and then back to the original data type. + For TF, see `tf.image.adjust_hue `_ and `tf.image.random_hue `_. + + Parameters + ----------- + im : should be a numpy arrays with values between 0 and 255. + hout : float. + - If is_offset is False, set all hue values to this value. 0 is red; 0.33 is green; 0.66 is blue. + - If is_offset is True, add this value as the offset to the hue channel. + is_offset : boolean, default True. + is_clip : boolean, default True. + - If True, set negative hue values to 0. + is_random : boolean, default False. + + Examples + --------- + - Random, add a random value between -0.2 and 0.2 as the offset to every hue values. + >>> im_hue = tl.prepro.adjust_hue(image, hout=0.2, is_offset=True, is_random=False) + + - Non-random, make all hue to green. + >>> im_green = tl.prepro.adjust_hue(image, hout=0.66, is_offset=False, is_random=False) + + References + ----------- + - `tf.image.random_hue `_. + - `tf.image.adjust_hue `_. + - `StackOverflow: Changing image hue with python PIL `_. 
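+
+    (Editor's worked example of the offset arithmetic: a pure-red pixel has
+    hue 0.0, so ``hout=0.66`` with ``is_offset=True`` shifts it to hue 0.66,
+    i.e. blue; with ``is_offset=False`` every pixel's hue is set to 0.66.)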
+ """ + hsv = rgb_to_hsv(im) + if is_random: + hout = np.random.uniform(-hout, hout) + + if is_offset: + hsv[...,0] += hout + else: + hsv[...,0] = hout + + if is_clip: + hsv[...,0] = np.clip(hsv[...,0], 0, np.inf) # Hao : can remove green dots + + rgb = hsv_to_rgb(hsv) + return rgb -def constant_multi(): - #TODO - pass + +# # contrast +# def constant(x, cutoff=0.5, gain=10, inv=False, is_random=False): +# # TODO +# x = exposure.adjust_sigmoid(x, cutoff=cutoff, gain=gain, inv=inv) +# return x +# +# def constant_multi(): +# #TODO +# pass # resize -def imresize(x, size=[100, 100], interp='bilinear', mode=None): +def imresize(x, size=[100, 100], interp='bicubic', mode=None): """Resize an image by given output size and method. Warning, this function will rescale the value to [0, 255]. @@ -841,6 +1090,36 @@ def imresize(x, size=[100, 100], interp='bilinear', mode=None): else: raise Exception("Unsupported channel %d" % x.shape[-1]) +# value scale +def pixel_value_scale(im, val=0.9, clip=[], is_random=False): + """Scales each value in the pixels of the image. + + Parameters + ----------- + im : numpy array for one image. + val : float. + - If is_random=False, multiply this value with all pixels. + - If is_random=True, multiply a value between [1-val, 1+val] with all pixels. + + Examples + ---------- + - Random + >>> im = pixel_value_scale(im, 0.1, [0, 255], is_random=True) + + - Non-random + >>> im = pixel_value_scale(im, 0.9, [0, 255], is_random=False) + """ + if is_random: + scale = 1 + np.random.uniform(-val, val) + im = im * scale + else: + im = im * val + + if len(clip) == 2: + im = np.clip(im, clip[0], clip[1]) + + return im + # normailization def samplewise_norm(x, rescale=None, samplewise_center=False, samplewise_std_normalization=False, channel_index=2, epsilon=1e-7): @@ -934,11 +1213,11 @@ def zca_whitening(x, principal_components): An image with dimension of [row, col, channel] (default). principal_components : matrix from ``get_zca_whitening_principal_components_img``. """ - # flatx = np.reshape(x, (x.size)) - print(principal_components.shape, x.shape) # ((28160, 28160), (160, 176, 1)) + flatx = np.reshape(x, (x.size)) + # print(principal_components.shape, x.shape) # ((28160, 28160), (160, 176, 1)) # flatx = np.reshape(x, (x.shape)) # flatx = np.reshape(x, (x.shape[0], )) - print(flatx.shape) # (160, 176, 1) + # print(flatx.shape) # (160, 176, 1) whitex = np.dot(flatx, principal_components) x = np.reshape(whitex, (x.shape[0], x.shape[1], x.shape[2])) return x @@ -988,7 +1267,7 @@ def channel_shift(x, intensity, is_random=False, channel_index=2): # x = np.rollaxis(x, 0, channel_index+1) # return x -def channel_shift_multi(x, intensity, channel_index=2): +def channel_shift_multi(x, intensity, is_random=False, channel_index=2): """Shift the channels of images with the same arguments, randomly or non-randomly, see `numpy.rollaxis `_ . Usually be used for image segmentation which x=[X, Y], X and Y should be matched. @@ -1076,13 +1355,13 @@ def transform_matrix_offset_center(matrix, x, y): return transform_matrix -def apply_transform(x, transform_matrix, channel_index=2, fill_mode='nearest', cval=0.): +def apply_transform(x, transform_matrix, channel_index=2, fill_mode='nearest', cval=0., order=1): """Return transformed images by given transform_matrix from ``transform_matrix_offset_center``. Parameters ---------- x : numpy array - Batch of images with dimension of 3, [batch_size, row, col, channel]. + An image with dimension of [row, col, channel] (default). 
transform_matrix : numpy array Transform matrix (offset center), can be generated by ``transform_matrix_offset_center`` channel_index : int @@ -1093,6 +1372,15 @@ def apply_transform(x, transform_matrix, channel_index=2, fill_mode='nearest', c - `scipy ndimage affine_transform `_ cval : scalar, optional Value used for points outside the boundaries of the input if mode='constant'. Default is 0.0 + order : int, optional + The order of interpolation. The order has to be in the range 0-5: + + - 0 Nearest-neighbor + - 1 Bi-linear (default) + - 2 Bi-quadratic + - 3 Bi-cubic + - 4 Bi-quartic + - 5 Bi-quintic - `scipy ndimage affine_transform `_ @@ -1104,7 +1392,7 @@ def apply_transform(x, transform_matrix, channel_index=2, fill_mode='nearest', c final_affine_matrix = transform_matrix[:2, :2] final_offset = transform_matrix[:2, 2] channel_images = [ndi.interpolation.affine_transform(x_channel, final_affine_matrix, - final_offset, order=0, mode=fill_mode, cval=cval) for x_channel in x] + final_offset, order=order, mode=fill_mode, cval=cval) for x_channel in x] x = np.stack(channel_images, axis=0) x = np.rollaxis(x, 0, channel_index+1) return x @@ -1118,7 +1406,7 @@ def projective_transform_by_points(x, src, dst, map_args={}, output_shape=None, x : numpy array An image with dimension of [row, col, channel] (default). src : list or numpy - The original coordinates, usually 4 coordinates of (x, y). + The original coordinates, usually 4 coordinates of (width, height). dst : list or numpy The coordinates after transformation, the number of coordinates is the same with src. map_args : dict, optional @@ -1146,7 +1434,7 @@ def projective_transform_by_points(x, src, dst, map_args={}, output_shape=None, Examples -------- >>> Assume X is an image from CIFAR 10, i.e. shape == (32, 32, 3) - >>> src = [[0,0],[0,32],[32,0],[32,32]] + >>> src = [[0,0],[0,32],[32,0],[32,32]] # [w, h] >>> dst = [[10,10],[0,32],[32,0],[32,32]] >>> x = projective_transform_by_points(X, src, dst) @@ -1252,7 +1540,7 @@ def binary_dilation(x, radius=3): """ from skimage.morphology import disk, binary_dilation mask = disk(radius) - x = binary_dilation(image, selem=mask) + x = binary_dilation(x, selem=mask) return x def dilation(x, radius=3): @@ -1270,6 +1558,706 @@ def dilation(x, radius=3): return x +def binary_erosion(x, radius=3): + """ Return binary morphological erosion of an image, + see `skimage.morphology.binary_erosion `_. + + Parameters + ----------- + x : 2D array image. + radius : int for the radius of mask. + """ + from skimage.morphology import disk, dilation, binary_erosion + mask = disk(radius) + x = binary_erosion(x, selem=mask) + return x + +def erosion(x, radius=3): + """ Return greyscale morphological erosion of an image, + see `skimage.morphology.erosion `_. + + Parameters + ----------- + x : 2D array image. + radius : int for the radius of mask. + """ + from skimage.morphology import disk, dilation, erosion + mask = disk(radius) + x = erosion(x, selem=mask) + return x + + + +## Object Detection + +def obj_box_coords_rescale(coords=[], shape=[100, 200]): + """Scale down a list of coordinates from pixel unit to the ratio of image size i.e. in the range of [0, 1]. + + Parameters + ------------ + coords : list of list for coordinates [[x, y, w, h], [x, y, w, h], ...] + shape : list of 2 integers for [height, width] of the image. + + Examples + --------- + >>> coords = obj_box_coords_rescale(coords=[[30, 40, 50, 50], [10, 10, 20, 20]], shape=[100, 100]) + >>> print(coords) + ... 
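The `order` argument plugs straight into the usual rotate/shift/zoom pattern built on `transform_matrix_offset_center`; a sketch with an assumed random image:

```python
import numpy as np

# Hypothetical single image [row, col, channel].
x = np.random.uniform(size=(100, 100, 3))

# Build a 15-degree rotation matrix, recenter it, then warp the image.
theta = np.deg2rad(15)
rotation_matrix = np.array([[np.cos(theta), -np.sin(theta), 0],
                            [np.sin(theta),  np.cos(theta), 0],
                            [0,              0,             1]])
h, w = x.shape[0], x.shape[1]
transform_matrix = transform_matrix_offset_center(rotation_matrix, h, w)
x_rot = apply_transform(x, transform_matrix, channel_index=2,
                        fill_mode='nearest', order=1)  # bi-linear interpolation
```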
[[0.3, 0.4, 0.5, 0.5], [0.1, 0.1, 0.2, 0.2]]
+    >>> coords = obj_box_coords_rescale(coords=[[30, 40, 50, 50]], shape=[50, 100])
+    >>> print(coords)
+    ... [[0.3, 0.8, 0.5, 1.0]]
+    >>> coords = obj_box_coords_rescale(coords=[[30, 40, 50, 50]], shape=[100, 200])
+    >>> print(coords)
+    ... [[0.15, 0.4, 0.25, 0.5]]
+    """
+    imh, imw = shape[0], shape[1]
+    imh = imh * 1.0     # * 1.0 for python2 : force division to be floating point
+    imw = imw * 1.0
+    coords_new = list()
+    for coord in coords:
+        assert len(coord) == 4, "coordinate should be 4 values : [x, y, w, h]"
+        x = coord[0] / imw
+        y = coord[1] / imh
+        w = coord[2] / imw
+        h = coord[3] / imh
+        coords_new.append([x, y, w, h])
+    return coords_new
+
+def obj_box_coord_rescale(coord=[], shape=[100, 200]):
+    """Scale down one coordinate from pixel unit to the ratio of image size, i.e. in the range of [0, 1].
+    It is the reverse process of ``obj_box_coord_scale_to_pixelunit``.
+
+    Parameters
+    ------------
+    coord : list of 4 numbers for one coordinate [x, y, w, h].
+    shape : list of 2 integers for [height, width] of the image.
+
+    Examples
+    ---------
+    >>> coord = obj_box_coord_rescale(coord=[30, 40, 50, 50], shape=[100, 100])
+    ... [0.3, 0.4, 0.5, 0.5]
+    """
+    return obj_box_coords_rescale(coords=[coord], shape=shape)[0]
+
+# coord = obj_box_coord_rescale(coord=[30, 40, 50, 50], shape=[100, 100])
+# print(coord)  # [0.3, 0.4, 0.5, 0.5]
+# exit()
+
+def obj_box_coord_scale_to_pixelunit(coord, shape=(100, 100, 3)):
+    """ Convert one coordinate [x, y, w (or x2), h (or y2)] in ratio format to image coordinate format.
+    It is the reverse process of ``obj_box_coord_rescale``.
+
+    Parameters
+    -----------
+    coord : list of float, [x, y, w (or x2), h (or y2)] in ratio format, i.e. value range [0~1].
+    shape : tuple of (height, width, channel (optional)).
+
+    Examples
+    ---------
+    >>> x, y, x2, y2 = obj_box_coord_scale_to_pixelunit([0.2, 0.3, 0.5, 0.7], shape=(100, 200, 3))
+    ... [40, 30, 100, 70]
+    """
+    imh, imw = shape[0:2]
+    x = int(coord[0] * imw)
+    x2 = int(coord[2] * imw)
+    y = int(coord[1] * imh)
+    y2 = int(coord[3] * imh)
+    return [x, y, x2, y2]
+
+# coords = obj_box_coords_rescale(coords=[[30, 40, 50, 50], [10, 10, 20, 20]], shape=[100, 100])
+# print(coords)
+# # ... [[0.3, 0.4, 0.5, 0.5], [0.1, 0.1, 0.2, 0.2]]
+# coords = obj_box_coords_rescale(coords=[[30, 40, 50, 50]], shape=[50, 100])
+# print(coords)
+# # ... [[0.3, 0.8, 0.5, 1.0]]
+# coords = obj_box_coords_rescale(coords=[[30, 40, 50, 50]], shape=[100, 200])
+# print(coords)
+# # ... [[0.15, 0.4, 0.25, 0.5]]
+# exit()
+
+def obj_box_coord_centroid_to_upleft_butright(coord, to_int=False):
+    """ Convert one coordinate [x_center, y_center, w, h] to [x1, y1, x2, y2] in up-left and bottom-right format.
+
+    Examples
+    ---------
+    >>> coord = obj_box_coord_centroid_to_upleft_butright([30, 40, 20, 20])
+    ... [20, 30, 40, 50]
+    """
+    assert len(coord) == 4, "coordinate should be 4 values : [x, y, w, h]"
+    x_center, y_center, w, h = coord
+    x = x_center - w / 2.
+    y = y_center - h / 2.
+    x2 = x + w
+    y2 = y + h
+    if to_int:
+        return [int(x), int(y), int(x2), int(y2)]
+    else:
+        return [x, y, x2, y2]
+
+# coord = obj_box_coord_centroid_to_upleft_butright([30, 40, 20, 20])
+# print(coord)  # [20, 30, 40, 50]
+# exit()
+
+def obj_box_coord_upleft_butright_to_centroid(coord):
+    """ Convert one coordinate [x1, y1, x2, y2] to [x_center, y_center, w, h].
+    It is the reverse process of ``obj_box_coord_centroid_to_upleft_butright``.
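Taken together, these converters compose into simple round trips; a sketch reusing the docstring values:

```python
# Pixel-unit [x, y, w, h] -> ratio format (shape is [height, width]).
coord_ratio = obj_box_coord_rescale(coord=[30, 40, 50, 50], shape=[100, 100])
# coord_ratio == [0.3, 0.4, 0.5, 0.5]

# Ratio [x, y, x2, y2] -> pixel unit (shape is (height, width, channel)).
x1, y1, x2, y2 = obj_box_coord_scale_to_pixelunit([0.2, 0.3, 0.5, 0.7],
                                                  shape=(100, 200, 3))
# [40, 30, 100, 70]

# Centroid (darknet) format -> corner format.
corners = obj_box_coord_centroid_to_upleft_butright([30, 40, 20, 20])
# corners == [20, 30, 40, 50]
```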
+ """ + assert len(coord) == 4, "coordinate should be 4 values : [x1, y1, x2, y2]" + x1, y1, x2, y2 = coord + w = x2 - x1 + h = y2 - y1 + x_c = x1 + w / 2. + y_c = y1 + h / 2. + return [x_c, y_c, w, h] + + +def obj_box_coord_centroid_to_upleft(coord): + """ Convert one coordinate [x_center, y_center, w, h] to [x, y, w, h]. + It is the reverse process of ``obj_box_coord_upleft_to_centroid``. + """ + assert len(coord) == 4, "coordinate should be 4 values : [x, y, w, h]" + x_center, y_center, w, h = coord + x = x_center - w / 2. + y = y_center - h / 2. + return [x, y, w, h] + +def obj_box_coord_upleft_to_centroid(coord): + """ Convert one coordinate [x, y, w, h] to [x_center, y_center, w, h]. + It is the reverse process of ``obj_box_coord_centroid_to_upleft``. + """ + assert len(coord) == 4, "coordinate should be 4 values : [x, y, w, h]" + x, y, w, h = coord + x_center = x + w / 2. + y_center = y + h / 2. + return [x_center, y_center, w, h] + +## +def parse_darknet_ann_str_to_list(annotation): + """ Input string format of class, x, y, w, h, return list of list format. + """ + annotation = annotation.split("\n") + ann = [] + for a in annotation: + a = a.split() + if len(a) == 5: + for i in range(len(a)): + if i == 0: + a[i] = int(a[i]) + else: + a[i] = float(a[i]) + ann.append(a) + return ann + +def parse_darknet_ann_list_to_cls_box(annotation): + """ Input list of [[class, x, y, w, h], ...], return two list of [class ...] and [[x, y, w, h], ...]. + """ + class_list = [] + bbox_list = [] + for i in range(len(annotation)): + class_list.append( annotation[i][0] ) + bbox_list.append( annotation[i][1:] ) + return class_list, bbox_list + + +def obj_box_left_right_flip(im, coords=[], is_rescale=False, is_center=False, is_random=False): + """Left-right flip the image and coordinates for object detection. + + Parameters + ---------- + im : numpy array + An image with dimension of [row, col, channel] (default). + coords : list of list for coordinates [[x, y, w, h], [x, y, w, h], ...] + is_rescale : boolean, default False + Set to True, if the input coordinates are rescaled to [0, 1]. + is_center : boolean, default False + Set to True, if the x and y of coordinates are the centroid. (i.e. darknet format) + is_random : boolean, default False + If True, randomly flip. + + Examples + -------- + >>> im = np.zeros([80, 100]) # as an image with shape width=100, height=80 + >>> im, coords = obj_box_left_right_flip(im, coords=[[0.2, 0.4, 0.3, 0.3], [0.1, 0.5, 0.2, 0.3]], is_rescale=True, is_center=True, is_random=False) + >>> print(coords) + ... [[0.8, 0.4, 0.3, 0.3], [0.9, 0.5, 0.2, 0.3]] + >>> im, coords = obj_box_left_right_flip(im, coords=[[0.2, 0.4, 0.3, 0.3]], is_rescale=True, is_center=False, is_random=False) + >>> print(coords) + ... [[0.5, 0.4, 0.3, 0.3]] + >>> im, coords = obj_box_left_right_flip(im, coords=[[20, 40, 30, 30]], is_rescale=False, is_center=True, is_random=False) + >>> print(coords) + ... [[80, 40, 30, 30]] + >>> im, coords = obj_box_left_right_flip(im, coords=[[20, 40, 30, 30]], is_rescale=False, is_center=False, is_random=False) + >>> print(coords) + ... [[50, 40, 30, 30]] + """ + def _flip(im, coords): + im = flip_axis(im, axis=1, is_random=False) + coords_new = list() + + for coord in coords: + assert len(coord) == 4, "coordinate should be 4 values : [x, y, w, h]" + if is_rescale: + if is_center: + # x_center' = 1 - x + x = 1. - coord[0] + else: + # x_center' = 1 - x - w + x = 1. 
- coord[0] - coord[2] + else: + if is_center: + # x' = im.width - x + x = im.shape[1] - coord[0] + else: + # x' = im.width - x - w + x = im.shape[1] - coord[0] - coord[2] + coords_new.append([x, coord[1], coord[2], coord[3]]) + return im, coords_new + + if is_random: + factor = np.random.uniform(-1, 1) + if factor > 0: + return _flip(im, coords) + else: + return im, coords + else: + return _flip(im, coords) + +# im = np.zeros([80, 100]) # as an image with shape width=100, height=80 +# im, coords = obj_box_left_right_flip(im, coords=[[0.2, 0.4, 0.3, 0.3], [0.1, 0.5, 0.2, 0.3]], is_rescale=True, is_center=True, is_random=False) +# print(coords) +# # ... [[0.8, 0.4, 0.3, 0.3], [0.9, 0.5, 0.2, 0.3]] +# im, coords = obj_box_left_right_flip(im, coords=[[0.2, 0.4, 0.3, 0.3]], is_rescale=True, is_center=False, is_random=False) +# print(coords) +# # [[0.5, 0.4, 0.3, 0.3]] +# im, coords = obj_box_left_right_flip(im, coords=[[20, 40, 30, 30]], is_rescale=False, is_center=True, is_random=False) +# print(coords) +# # ... [[80, 40, 30, 30]] +# im, coords = obj_box_left_right_flip(im, coords=[[20, 40, 30, 30]], is_rescale=False, is_center=False, is_random=False) +# print(coords) +# # [[50, 40, 30, 30]] +# exit() + +def obj_box_imresize(im, coords=[], size=[100, 100], interp='bicubic', mode=None, is_rescale=False): + """Resize an image, and compute the new bounding box coordinates. + + Parameters + ------------- + im : numpy array + An image with dimension of [row, col, channel] (default). + coords : list of list for coordinates [[x, y, w, h], [x, y, w, h], ...] + size, interp, mode : see ``tl.prepro.imresize`` for details. + is_rescale : boolean, default False + Set to True, if the input coordinates are rescaled to [0, 1], then return the original coordinates. + + Examples + -------- + >>> im = np.zeros([80, 100, 3]) # as an image with shape width=100, height=80 + >>> _, coords = obj_box_imresize(im, coords=[[20, 40, 30, 30], [10, 20, 20, 20]], size=[160, 200], is_rescale=False) + >>> print(coords) + ... [[40, 80, 60, 60], [20, 40, 40, 40]] + >>> _, coords = obj_box_imresize(im, coords=[[20, 40, 30, 30]], size=[40, 100], is_rescale=False) + >>> print(coords) + ... [20, 20, 30, 15] + >>> _, coords = obj_box_imresize(im, coords=[[20, 40, 30, 30]], size=[60, 150], is_rescale=False) + >>> print(coords) + ... [30, 30, 45, 22] + >>> im2, coords = obj_box_imresize(im, coords=[[0.2, 0.4, 0.3, 0.3]], size=[160, 200], is_rescale=True) + >>> print(coords, im2.shape) + ... [0.2, 0.4, 0.3, 0.3] (160, 200, 3) + """ + imh, imw = im.shape[0:2] + imh = imh * 1.0 # * 1.0 for python2 : force division to be float point + imw = imw * 1.0 + im = imresize(im, size=size, interp=interp, mode=mode) + + if is_rescale is False: + coords_new = list() + for coord in coords: + assert len(coord) == 4, "coordinate should be 4 values : [x, y, w, h]" + # x' = x * (imw'/imw) + x = int(coord[0] * (size[1]/imw)) + # y' = y * (imh'/imh) + # print('>>', coord[1], size[0], imh) + y = int(coord[1] * (size[0]/imh)) + # w' = w * (imw'/imw) + w = int(coord[2] * (size[1]/imw)) + # h' = h * (imh'/imh) + h = int(coord[3] * (size[0]/imh)) + coords_new.append([x, y, w, h]) + return im, coords_new + else: + return im, coords + +# im = np.zeros([80, 100, 3]) # as an image with shape width=100, height=80 +# _, coords = obj_box_imresize(im, coords=[[20, 40, 30, 30], [10, 20, 20, 20]], size=[160, 200], is_rescale=False) +# print(coords) +# # ... 
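A sketch chaining the two box-aware ops above on a hypothetical image, with pixel-unit, up-left [x, y, w, h] boxes:

```python
import numpy as np

# Hypothetical image (height=80, width=100) with one box in pixel units.
im = np.zeros([80, 100, 3])
coords = [[20, 40, 30, 30]]  # [x, y, w, h], up-left corner format

# Mirror horizontally; x becomes im_width - x - w.
im_f, coords_f = obj_box_left_right_flip(im, coords=coords,
                                         is_rescale=False, is_center=False,
                                         is_random=False)
# coords_f == [[50, 40, 30, 30]]

# Resize to 160x200 and scale the box accordingly.
im_r, coords_r = obj_box_imresize(im_f, coords=coords_f, size=[160, 200],
                                  is_rescale=False)
# coords_r == [[100, 80, 60, 60]]
```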
[[40, 80, 60, 60], [20, 40, 40, 40]] +# _, coords = obj_box_imresize(im, coords=[[20, 40, 30, 30]], size=[40, 100], is_rescale=False) +# print(coords) +# # ... [20, 20, 30, 15] +# _, coords = obj_box_imresize(im, coords=[[20, 40, 30, 30]], size=[60, 150], is_rescale=False) +# print(coords) +# # ... [30, 30, 45, 22] +# im2, coords = obj_box_imresize(im, coords=[[0.2, 0.4, 0.3, 0.3]], size=[160, 200], is_rescale=True) +# print(coords, im2.shape) +# # ... [0.2, 0.4, 0.3, 0.3] (160, 200, 3) +# exit() + +def obj_box_crop(im, classes=[], coords=[], wrg=100, hrg=100, + is_rescale=False, is_center=False, is_random=False, + thresh_wh=0.02, thresh_wh2=12.): + """Randomly or centrally crop an image, and compute the new bounding box coordinates. + Objects outside the cropped image will be removed. + + Parameters + ----------- + im : numpy array + An image with dimension of [row, col, channel] (default). + classes : list of class ID (int). + coords : list of list for coordinates [[x, y, w, h], [x, y, w, h], ...] + wrg, hrg, is_random : see ``tl.prepro.crop`` for details. + is_rescale : boolean, default False + Set to True, if the input coordinates are rescaled to [0, 1]. + is_center : boolean, default False + Set to True, if the x and y of coordinates are the centroid. (i.e. darknet format) + thresh_wh : float + Threshold, remove the box if its ratio of width(height) to image size less than the threshold. + thresh_wh2 : float + Threshold, remove the box if its ratio of width to height or vice verse higher than the threshold. + """ + h, w = im.shape[0], im.shape[1] + assert (h > hrg) and (w > wrg), "The size of cropping should smaller than the original image" + if is_random: + h_offset = int(np.random.uniform(0, h-hrg) -1) + w_offset = int(np.random.uniform(0, w-wrg) -1) + h_end = hrg + h_offset + w_end = wrg + w_offset + im_new = im[h_offset: h_end ,w_offset: w_end] + else: # central crop + h_offset = int(np.floor((h - hrg)/2.)) + w_offset = int(np.floor((w - wrg)/2.)) + h_end = h_offset + hrg + w_end = w_offset + wrg + im_new = im[h_offset: h_end, w_offset: w_end] + + # w + # _____________________________ + # | h/w offset | + # | ------- | + # h | | | | + # | | | | + # | ------- | + # | h/w end | + # |___________________________| + + def _get_coord(coord): + """ Input pixel-unit [x, y, w, h] format, then make sure [x, y] it is the up-left coordinates, + before getting the new coordinates. + Boxes outsides the cropped image will be removed. + """ + if is_center: + coord = obj_box_coord_centroid_to_upleft(coord) + + ##======= pixel unit format and upleft, w, h ==========## + + # x = np.clip( coord[0] - w_offset, 0, w_end - w_offset) + # y = np.clip( coord[1] - h_offset, 0, h_end - h_offset) + # w = np.clip( coord[2] , 0, w_end - w_offset) + # h = np.clip( coord[3] , 0, h_end - h_offset) + + x = coord[0] - w_offset + y = coord[1] - h_offset + w = coord[2] + h = coord[3] + + if x < 0: + if x + w <= 0: + return None + w = w + x + x = 0 + elif x > im_new.shape[1]: # object outside the cropped image + return None + + if y < 0: + if y + h <= 0: + return None + h = h + y + y = 0 + elif y > im_new.shape[0]: # object outside the cropped image + return None + + if (x is not None) and (x + w > im_new.shape[1]): # box outside the cropped image + w = im_new.shape[1] - x + + if (y is not None) and (y + h > im_new.shape[0]): # box outside the cropped image + h = im_new.shape[0] - y + + if (w / (h+1.) > thresh_wh2) or (h / (w+1.) 
> thresh_wh2): # object shape strange: too narrow + # print('xx', w, h) + return None + + if (w / (im_new.shape[1]*1.) < thresh_wh) or (h / (im_new.shape[0]*1.) < thresh_wh): # object shape strange: too narrow + # print('yy', w, im_new.shape[1], h, im_new.shape[0]) + return None + + coord = [x, y, w, h] + + ## convert back if input format is center. + if is_center: + coord = obj_box_coord_upleft_to_centroid(coord) + + return coord + + coords_new = list() + classes_new = list() + for i in range(len(coords)): + coord = coords[i] + assert len(coord) == 4, "coordinate should be 4 values : [x, y, w, h]" + if is_rescale: + """ for scaled coord, upscaled before process and scale back in the end. """ + coord = obj_box_coord_scale_to_pixelunit(coord, im.shape) + coord = _get_coord(coord) + if coord is not None: + coord = obj_box_coord_rescale(coord, im_new.shape) + coords_new.append(coord) + classes_new.append(classes[i]) + else: + coord = _get_coord(coord) + if coord is not None: + coords_new.append(coord) + classes_new.append(classes[i]) + return im_new, classes_new, coords_new + +def obj_box_shift(im, classes=[], coords=[], wrg=0.1, hrg=0.1, + row_index=0, col_index=1, channel_index=2, + fill_mode='nearest', cval=0., order=1, + is_rescale=False, is_center=False, is_random=False, + thresh_wh=0.02, thresh_wh2=12.): + """ Shift an image randomly or non-randomly, and compute the new bounding box coordinates. + Objects outside the cropped image will be removed. + + Parameters + ----------- + im : numpy array + An image with dimension of [row, col, channel] (default). + classes : list of class ID (int). + coords : list of list for coordinates [[x, y, w, h], [x, y, w, h], ...] + wrg, hrg, row_index, col_index, channel_index, is_random, fill_mode, cval, order : see ``tl.prepro.shift``. + is_rescale : boolean, default False + Set to True, if the input coordinates are rescaled to [0, 1]. + is_center : boolean, default False + Set to True, if the x and y of coordinates are the centroid. (i.e. darknet format) + thresh_wh : float + Threshold, remove the box if its ratio of width(height) to image size less than the threshold. + thresh_wh2 : float + Threshold, remove the box if its ratio of width to height or vice verse higher than the threshold. + """ + imh, imw = im.shape[row_index], im.shape[col_index] + assert (hrg < 1.0) and (hrg > 0.) and (wrg < 1.0) and (wrg > 0.) , "shift range should be (0, 1)" + if is_random: + tx = np.random.uniform(-hrg, hrg) * imh + ty = np.random.uniform(-wrg, wrg) * imw + else: + tx, ty = hrg * imh, wrg * imw + translation_matrix = np.array([[1, 0, tx], + [0, 1, ty], + [0, 0, 1]]) + + transform_matrix = translation_matrix # no need to do offset + im_new = apply_transform(im, transform_matrix, channel_index, fill_mode, cval, order) + + # modified from obj_box_crop + def _get_coord(coord): + """ Input pixel-unit [x, y, w, h] format, then make sure [x, y] it is the up-left coordinates, + before getting the new coordinates. + Boxes outsides the cropped image will be removed. 
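A minimal `obj_box_crop` sketch on assumed data, using a central (non-random) crop in pixel units; boxes are shifted into the crop frame, clipped at its borders, and dropped when they fall outside or become degenerate:

```python
import numpy as np

# Hypothetical 80x100 image with two labelled boxes in pixel units.
im = np.zeros([80, 100, 3])
classes = [0, 1]
coords = [[10, 10, 30, 30], [60, 50, 30, 20]]  # [x, y, w, h]

# Central 60x60 crop; surviving boxes come back shifted and clipped.
im_new, classes_new, coords_new = obj_box_crop(
    im, classes=classes, coords=coords, wrg=60, hrg=60,
    is_rescale=False, is_center=False, is_random=False)
print(len(classes_new), len(coords_new))  # number of boxes surviving the crop
```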
+ """ + if is_center: + coord = obj_box_coord_centroid_to_upleft(coord) + + ##======= pixel unit format and upleft, w, h ==========## + x = coord[0] - ty # only change this + y = coord[1] - tx # only change this + w = coord[2] + h = coord[3] + + if x < 0: + if x + w <= 0: + return None + w = w + x + x = 0 + elif x > im_new.shape[1]: # object outside the cropped image + return None + + if y < 0: + if y + h <= 0: + return None + h = h + y + y = 0 + elif y > im_new.shape[0]: # object outside the cropped image + return None + + if (x is not None) and (x + w > im_new.shape[1]): # box outside the cropped image + w = im_new.shape[1] - x + + if (y is not None) and (y + h > im_new.shape[0]): # box outside the cropped image + h = im_new.shape[0] - y + + if (w / (h+1.) > thresh_wh2) or (h / (w+1.) > thresh_wh2): # object shape strange: too narrow + # print('xx', w, h) + return None + + if (w / (im_new.shape[1]*1.) < thresh_wh) or (h / (im_new.shape[0]*1.) < thresh_wh): # object shape strange: too narrow + # print('yy', w, im_new.shape[1], h, im_new.shape[0]) + return None + + coord = [x, y, w, h] + + ## convert back if input format is center. + if is_center: + coord = obj_box_coord_upleft_to_centroid(coord) + + return coord + + coords_new = list() + classes_new = list() + for i in range(len(coords)): + coord = coords[i] + assert len(coord) == 4, "coordinate should be 4 values : [x, y, w, h]" + if is_rescale: + """ for scaled coord, upscaled before process and scale back in the end. """ + coord = obj_box_coord_scale_to_pixelunit(coord, im.shape) + coord = _get_coord(coord) + if coord is not None: + coord = obj_box_coord_rescale(coord, im_new.shape) + coords_new.append(coord) + classes_new.append(classes[i]) + else: + coord = _get_coord(coord) + if coord is not None: + coords_new.append(coord) + classes_new.append(classes[i]) + return im_new, classes_new, coords_new + +def obj_box_zoom(im, classes=[], coords=[], zoom_range=(0.9, 1.1), + row_index=0, col_index=1, channel_index=2, fill_mode='nearest', cval=0., order=1, + is_rescale=False, is_center=False, is_random=False, + thresh_wh=0.02, thresh_wh2=12.): + """Zoom in and out of a single image, randomly or non-randomly, and compute the new bounding box coordinates. + Objects outside the cropped image will be removed. + + Parameters + ----------- + im : numpy array + An image with dimension of [row, col, channel] (default). + classes : list of class ID (int). + coords : list of list for coordinates [[x, y, w, h], [x, y, w, h], ...] + zoom_range, row_index, col_index, channel_index, is_random, fill_mode, cval, order : see ``tl.prepro.zoom``. + is_rescale : boolean, default False + Set to True, if the input coordinates are rescaled to [0, 1]. + is_center : boolean, default False + Set to True, if the x and y of coordinates are the centroid. (i.e. darknet format) + thresh_wh : float + Threshold, remove the box if its ratio of width(height) to image size less than the threshold. + thresh_wh2 : float + Threshold, remove the box if its ratio of width to height or vice verse higher than the threshold. + """ + if len(zoom_range) != 2: + raise Exception('zoom_range should be a tuple or list of two floats. 
' + 'Received arg: ', zoom_range) + if is_random: + if zoom_range[0] == 1 and zoom_range[1] == 1: + zx, zy = 1, 1 + print(" random_zoom : not zoom in/out") + else: + zx, zy = np.random.uniform(zoom_range[0], zoom_range[1], 2) + else: + zx, zy = zoom_range + # print(zx, zy) + zoom_matrix = np.array([[zx, 0, 0], + [0, zy, 0], + [0, 0, 1]]) + + h, w = im.shape[row_index], im.shape[col_index] + transform_matrix = transform_matrix_offset_center(zoom_matrix, h, w) + im_new = apply_transform(im, transform_matrix, channel_index, fill_mode, cval, order) + + + # modified from obj_box_crop + def _get_coord(coord): + """ Input pixel-unit [x, y, w, h] format, then make sure [x, y] it is the up-left coordinates, + before getting the new coordinates. + Boxes outsides the cropped image will be removed. + """ + if is_center: + coord = obj_box_coord_centroid_to_upleft(coord) + + ##======= pixel unit format and upleft, w, h ==========## + x = (coord[0] - im.shape[1]/2) / zy + im.shape[1]/2 # only change this + y = (coord[1] - im.shape[0]/2) / zx + im.shape[0]/2 # only change this + w = coord[2] / zy # only change this + h = coord[3] / zx # only change thisS + + if x < 0: + if x + w <= 0: + return None + w = w + x + x = 0 + elif x > im_new.shape[1]: # object outside the cropped image + return None + + if y < 0: + if y + h <= 0: + return None + h = h + y + y = 0 + elif y > im_new.shape[0]: # object outside the cropped image + return None + + if (x is not None) and (x + w > im_new.shape[1]): # box outside the cropped image + w = im_new.shape[1] - x + + if (y is not None) and (y + h > im_new.shape[0]): # box outside the cropped image + h = im_new.shape[0] - y + + if (w / (h+1.) > thresh_wh2) or (h / (w+1.) > thresh_wh2): # object shape strange: too narrow + # print('xx', w, h) + return None + + if (w / (im_new.shape[1]*1.) < thresh_wh) or (h / (im_new.shape[0]*1.) < thresh_wh): # object shape strange: too narrow + # print('yy', w, im_new.shape[1], h, im_new.shape[0]) + return None + + coord = [x, y, w, h] + + ## convert back if input format is center. + if is_center: + coord = obj_box_coord_upleft_to_centroid(coord) + + return coord + + coords_new = list() + classes_new = list() + for i in range(len(coords)): + coord = coords[i] + assert len(coord) == 4, "coordinate should be 4 values : [x, y, w, h]" + if is_rescale: + """ for scaled coord, upscaled before process and scale back in the end. """ + coord = obj_box_coord_scale_to_pixelunit(coord, im.shape) + coord = _get_coord(coord) + if coord is not None: + coord = obj_box_coord_rescale(coord, im_new.shape) + coords_new.append(coord) + classes_new.append(classes[i]) + else: + coord = _get_coord(coord) + if coord is not None: + coords_new.append(coord) + classes_new.append(classes[i]) + return im_new, classes_new, coords_new + + + ## Sequence @@ -1342,7 +2330,34 @@ def pad_sequences(sequences, maxlen=None, dtype='int32', padding='post', truncat x[idx, -len(trunc):] = trunc else: raise ValueError('Padding type "%s" not understood' % padding) - return x + return x.tolist() + +def remove_pad_sequences(sequences, pad_id=0): + """Remove padding. + + Parameters + ----------- + sequences : list of list. + pad_id : int. + + Examples + ---------- + >>> sequences = [[2,3,4,0,0], [5,1,2,3,4,0,0,0], [4,5,0,2,4,0,0,0]] + >>> print(remove_pad_sequences(sequences, pad_id=0)) + ... 
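A minimal `obj_box_zoom` sketch on assumed data, using rescaled darknet-style (centroid) boxes:

```python
import numpy as np

im = np.zeros([80, 100, 3])        # hypothetical image
classes = [0]
coords = [[0.3, 0.4, 0.2, 0.2]]    # rescaled [0, 1], centroid (darknet) format

# Random zoom in [0.9, 1.1]; box centres and sizes are rescaled by the
# sampled zoom factors, and out-of-frame boxes are dropped.
im_new, classes_new, coords_new = obj_box_zoom(
    im, classes=classes, coords=coords, zoom_range=(0.9, 1.1),
    is_rescale=True, is_center=True, is_random=True)
```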
[[2, 3, 4], [5, 1, 2, 3, 4], [4, 5, 0, 2, 4]] + """ + import copy + sequences_out = copy.deepcopy(sequences) + for i in range(len(sequences)): + # for j in range(len(sequences[i])): + # if sequences[i][j] == pad_id: + # sequences_out[i] = sequences_out[i][:j] + # break + for j in range(1, len(sequences[i])): + if sequences[i][-j] != pad_id: + sequences_out[i] = sequences_out[i][0:-j+1] + break + return sequences_out def process_sequences(sequences, end_id=0, pad_val=0, is_shorten=True, remain_end_id=False): """Set all tokens(ids) after END token to the padding value, and then shorten (option) it to the maximum sequence length in this batch. @@ -1409,6 +2424,63 @@ def sequences_add_start_id(sequences, start_id=0, remove_last=False): sequences_out[i] = [start_id] + sequences[i] return sequences_out +def sequences_add_end_id(sequences, end_id=888): + """Add special end token(id) in the end of each sequence. + + Parameters + ----------- + sequences : list of list. + end_id : int. + + Examples + --------- + >>> sequences = [[1,2,3],[4,5,6,7]] + >>> print(sequences_add_end_id(sequences, end_id=999)) + ... [[1, 2, 3, 999], [4, 5, 6, 999]] + """ + sequences_out = [[] for _ in range(len(sequences))]#[[]] * len(sequences) + for i in range(len(sequences)): + sequences_out[i] = sequences[i] + [end_id] + return sequences_out + + +def sequences_add_end_id_after_pad(sequences, end_id=888, pad_id=0): + """Add special end token(id) in the end of each sequence. + + Parameters + ----------- + sequences : list of list. + end_id : int. + pad_id : int. + + Examples + --------- + >>> sequences = [[1,2,0,0], [1,2,3,0], [1,2,3,4]] + >>> print(sequences_add_end_id_after_pad(sequences, end_id=99, pad_id=0)) + ... [[1, 2, 99, 0], [1, 2, 3, 99], [1, 2, 3, 4]] + """ + # sequences_out = [[] for _ in range(len(sequences))]#[[]] * len(sequences) + import copy + sequences_out = copy.deepcopy(sequences) + # # add a pad to all + # for i in range(len(sequences)): + # for j in range(len(sequences[i])): + # sequences_out[i].append(pad_id) + # # pad -- > end + # max_len = 0 + for i in range(len(sequences)): + for j in range(len(sequences[i])): + if sequences[i][j] == pad_id: + sequences_out[i][j] = end_id + # if j > max_len: + # max_len = j + break + # # remove pad if too long + # for i in range(len(sequences)): + # for j in range(len(sequences[i])): + # sequences_out[i] = sequences_out[i][:max_len+1] + return sequences_out + def sequences_get_mask(sequences, pad_val=0): """Return mask for sequences. @@ -1435,171 +2507,165 @@ def sequences_get_mask(sequences, pad_val=0): ## Tensor Opt -def distorted_images(images=None, height=24, width=24): - """Distort images for generating more training data. - - Features - --------- - They are cropped to height * width pixels randomly. - - They are approximately whitened to make the model insensitive to dynamic range. - - Randomly flip the image from left to right. - - Randomly distort the image brightness. - - Randomly distort the image contrast. - - Whiten (Normalize) the images. - - Parameters - ---------- - images : 4D Tensor - The tensor or placeholder of images - height : int - The height for random crop. - width : int - The width for random crop. 
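The padding helpers compose naturally; a sketch reusing docstring-style values:

```python
sequences = [[2, 3, 4, 0, 0], [5, 1, 2, 3, 4, 0, 0, 0]]

# Replace the first trailing pad of each row with an explicit end id ...
with_end = sequences_add_end_id_after_pad(sequences, end_id=99, pad_id=0)
# [[2, 3, 4, 99, 0], [5, 1, 2, 3, 4, 99, 0, 0]]

# ... then strip the remaining padding. Note that remove_pad_sequences
# assumes at least one trailing pad per row, as written.
print(remove_pad_sequences(with_end, pad_id=0))
# [[2, 3, 4, 99], [5, 1, 2, 3, 4, 99]]
```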
- - Returns - ------- - result : tuple of Tensor - (Tensor for distorted images, Tensor for while loop index) - - Examples - -------- - >>> X_train, y_train, X_test, y_test = tl.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) - >>> sess = tf.InteractiveSession() - >>> batch_size = 128 - >>> x = tf.placeholder(tf.float32, shape=[batch_size, 32, 32, 3]) - >>> distorted_images_op = tl.preprocess.distorted_images(images=x, height=24, width=24) - >>> sess.run(tf.initialize_all_variables()) - >>> feed_dict={x: X_train[0:batch_size,:,:,:]} - >>> distorted_images, idx = sess.run(distorted_images_op, feed_dict=feed_dict) - >>> tl.visualize.images2d(X_train[0:9,:,:,:], second=2, saveable=False, name='cifar10', dtype=np.uint8, fig_idx=20212) - >>> tl.visualize.images2d(distorted_images[1:10,:,:,:], second=10, saveable=False, name='distorted_images', dtype=None, fig_idx=23012) - - Notes - ------ - - The first image in 'distorted_images' should be removed. - - References - ----------- - - `tensorflow.models.image.cifar10.cifar10_input `_ - """ - print("This function is deprecated, please use tf.map_fn instead, e.g:\n \ - t_image = tf.map_fn(lambda img: tf.image.random_brightness(img, max_delta=32. / 255.), t_image)\n \ - t_image = tf.map_fn(lambda img: tf.image.random_contrast(img, lower=0.5, upper=1.5), t_image)\n \ - t_image = tf.map_fn(lambda img: tf.image.random_saturation(img, lower=0.5, upper=1.5), t_image)\n \ - t_image = tf.map_fn(lambda img: tf.image.random_hue(img, max_delta=0.032), t_image)") - exit() - # print(" [Warning] distorted_images will be deprecated due to speed, see TFRecord tutorial for more info...") - try: - batch_size = int(images._shape[0]) - except: - raise Exception('unknow batch_size of images') - distorted_x = tf.Variable(tf.constant(0.1, shape=[1, height, width, 3])) - i = tf.Variable(tf.constant(0)) - - c = lambda distorted_x, i: tf.less(i, batch_size) - - def body(distorted_x, i): - # 1. Randomly crop a [height, width] section of the image. - image = tf.random_crop(tf.gather(images, i), [height, width, 3]) - # 2. Randomly flip the image horizontally. - image = tf.image.random_flip_left_right(image) - # 3. Randomly change brightness. - image = tf.image.random_brightness(image, max_delta=63) - # 4. Randomly change contrast. - image = tf.image.random_contrast(image, lower=0.2, upper=1.8) - # 5. Subtract off the mean and divide by the variance of the pixels. - image = tf.image.per_image_whitening(image) - # 6. Append the image to a batch. - image = tf.expand_dims(image, 0) - return tf.concat(0, [distorted_x, image]), tf.add(i, 1) - - result = tf.while_loop(cond=c, body=body, loop_vars=(distorted_x, i), parallel_iterations=16) - return result - - -def crop_central_whiten_images(images=None, height=24, width=24): - """Crop the central of image, and normailize it for test data. - - They are cropped to central of height * width pixels. - - Whiten (Normalize) the images. - - Parameters - ---------- - images : 4D Tensor - The tensor or placeholder of images - height : int - The height for central crop. - width : int - The width for central crop. 
- - Returns - ------- - result : tuple Tensor - (Tensor for distorted images, Tensor for while loop index) - - Examples - -------- - >>> X_train, y_train, X_test, y_test = tl.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) - >>> sess = tf.InteractiveSession() - >>> batch_size = 128 - >>> x = tf.placeholder(tf.float32, shape=[batch_size, 32, 32, 3]) - >>> central_images_op = tl.preprocess.crop_central_whiten_images(images=x, height=24, width=24) - >>> sess.run(tf.initialize_all_variables()) - >>> feed_dict={x: X_train[0:batch_size,:,:,:]} - >>> central_images, idx = sess.run(central_images_op, feed_dict=feed_dict) - >>> tl.visualize.images2d(X_train[0:9,:,:,:], second=2, saveable=False, name='cifar10', dtype=np.uint8, fig_idx=20212) - >>> tl.visualize.images2d(central_images[1:10,:,:,:], second=10, saveable=False, name='central_images', dtype=None, fig_idx=23012) - - Notes - ------ - The first image in 'central_images' should be removed. - - Code References - ---------------- - - ``tensorflow.models.image.cifar10.cifar10_input`` - """ - print("This function is deprecated, please use tf.map_fn instead, e.g:\n \ - t_image = tf.map_fn(lambda img: tf.image.random_brightness(img, max_delta=32. / 255.), t_image)\n \ - t_image = tf.map_fn(lambda img: tf.image.random_contrast(img, lower=0.5, upper=1.5), t_image)\n \ - t_image = tf.map_fn(lambda img: tf.image.random_saturation(img, lower=0.5, upper=1.5), t_image)\n \ - t_image = tf.map_fn(lambda img: tf.image.random_hue(img, max_delta=0.032), t_image)") - exit() - # print(" [Warning] crop_central_whiten_images will be deprecated due to speed, see TFRecord tutorial for more info...") - try: - batch_size = int(images._shape[0]) - except: - raise Exception('unknow batch_size of images') - central_x = tf.Variable(tf.constant(0.1, shape=[1, height, width, 3])) - i = tf.Variable(tf.constant(0)) - - c = lambda central_x, i: tf.less(i, batch_size) - - def body(central_x, i): - # 1. Crop the central [height, width] of the image. - image = tf.image.resize_image_with_crop_or_pad(tf.gather(images, i), height, width) - # 2. Subtract off the mean and divide by the variance of the pixels. - image = tf.image.per_image_whitening(image) - # 5. Append the image to a batch. - image = tf.expand_dims(image, 0) - return tf.concat(0, [central_x, image]), tf.add(i, 1) - - result = tf.while_loop(cond=c, body=body, loop_vars=(central_x, i), parallel_iterations=16) - return result - - - - - - - - - +# def distorted_images(images=None, height=24, width=24): +# """Distort images for generating more training data. +# +# Features +# --------- +# They are cropped to height * width pixels randomly. +# +# They are approximately whitened to make the model insensitive to dynamic range. +# +# Randomly flip the image from left to right. +# +# Randomly distort the image brightness. +# +# Randomly distort the image contrast. +# +# Whiten (Normalize) the images. +# +# Parameters +# ---------- +# images : 4D Tensor +# The tensor or placeholder of images +# height : int +# The height for random crop. +# width : int +# The width for random crop. 
+# +# Returns +# ------- +# result : tuple of Tensor +# (Tensor for distorted images, Tensor for while loop index) +# +# Examples +# -------- +# >>> X_train, y_train, X_test, y_test = tl.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) +# >>> sess = tf.InteractiveSession() +# >>> batch_size = 128 +# >>> x = tf.placeholder(tf.float32, shape=[batch_size, 32, 32, 3]) +# >>> distorted_images_op = tl.preprocess.distorted_images(images=x, height=24, width=24) +# >>> sess.run(tf.initialize_all_variables()) +# >>> feed_dict={x: X_train[0:batch_size,:,:,:]} +# >>> distorted_images, idx = sess.run(distorted_images_op, feed_dict=feed_dict) +# >>> tl.visualize.images2d(X_train[0:9,:,:,:], second=2, saveable=False, name='cifar10', dtype=np.uint8, fig_idx=20212) +# >>> tl.visualize.images2d(distorted_images[1:10,:,:,:], second=10, saveable=False, name='distorted_images', dtype=None, fig_idx=23012) +# +# Notes +# ------ +# - The first image in 'distorted_images' should be removed. +# +# References +# ----------- +# - `tensorflow.models.image.cifar10.cifar10_input `_ +# """ +# print("This function is deprecated, please use tf.map_fn instead, e.g:\n \ +# t_image = tf.map_fn(lambda img: tf.image.random_brightness(img, max_delta=32. / 255.), t_image)\n \ +# t_image = tf.map_fn(lambda img: tf.image.random_contrast(img, lower=0.5, upper=1.5), t_image)\n \ +# t_image = tf.map_fn(lambda img: tf.image.random_saturation(img, lower=0.5, upper=1.5), t_image)\n \ +# t_image = tf.map_fn(lambda img: tf.image.random_hue(img, max_delta=0.032), t_image)") +# exit() +# # print(" [Warning] distorted_images will be deprecated due to speed, see TFRecord tutorial for more info...") +# try: +# batch_size = int(images._shape[0]) +# except: +# raise Exception('unknow batch_size of images') +# distorted_x = tf.Variable(tf.constant(0.1, shape=[1, height, width, 3])) +# i = tf.Variable(tf.constant(0)) +# +# c = lambda distorted_x, i: tf.less(i, batch_size) +# +# def body(distorted_x, i): +# # 1. Randomly crop a [height, width] section of the image. +# image = tf.random_crop(tf.gather(images, i), [height, width, 3]) +# # 2. Randomly flip the image horizontally. +# image = tf.image.random_flip_left_right(image) +# # 3. Randomly change brightness. +# image = tf.image.random_brightness(image, max_delta=63) +# # 4. Randomly change contrast. +# image = tf.image.random_contrast(image, lower=0.2, upper=1.8) +# # 5. Subtract off the mean and divide by the variance of the pixels. +# image = tf.image.per_image_whitening(image) +# # 6. Append the image to a batch. +# image = tf.expand_dims(image, 0) +# return tf.concat(0, [distorted_x, image]), tf.add(i, 1) +# +# result = tf.while_loop(cond=c, body=body, loop_vars=(distorted_x, i), parallel_iterations=16) +# return result +# +# +# def crop_central_whiten_images(images=None, height=24, width=24): +# """Crop the central of image, and normailize it for test data. +# +# They are cropped to central of height * width pixels. +# +# Whiten (Normalize) the images. +# +# Parameters +# ---------- +# images : 4D Tensor +# The tensor or placeholder of images +# height : int +# The height for central crop. +# width : int +# The width for central crop. 
+# +# Returns +# ------- +# result : tuple Tensor +# (Tensor for distorted images, Tensor for while loop index) +# +# Examples +# -------- +# >>> X_train, y_train, X_test, y_test = tl.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) +# >>> sess = tf.InteractiveSession() +# >>> batch_size = 128 +# >>> x = tf.placeholder(tf.float32, shape=[batch_size, 32, 32, 3]) +# >>> central_images_op = tl.preprocess.crop_central_whiten_images(images=x, height=24, width=24) +# >>> sess.run(tf.initialize_all_variables()) +# >>> feed_dict={x: X_train[0:batch_size,:,:,:]} +# >>> central_images, idx = sess.run(central_images_op, feed_dict=feed_dict) +# >>> tl.visualize.images2d(X_train[0:9,:,:,:], second=2, saveable=False, name='cifar10', dtype=np.uint8, fig_idx=20212) +# >>> tl.visualize.images2d(central_images[1:10,:,:,:], second=10, saveable=False, name='central_images', dtype=None, fig_idx=23012) +# +# Notes +# ------ +# The first image in 'central_images' should be removed. +# +# Code References +# ---------------- +# - ``tensorflow.models.image.cifar10.cifar10_input`` +# """ +# print("This function is deprecated, please use tf.map_fn instead, e.g:\n \ +# t_image = tf.map_fn(lambda img: tf.image.random_brightness(img, max_delta=32. / 255.), t_image)\n \ +# t_image = tf.map_fn(lambda img: tf.image.random_contrast(img, lower=0.5, upper=1.5), t_image)\n \ +# t_image = tf.map_fn(lambda img: tf.image.random_saturation(img, lower=0.5, upper=1.5), t_image)\n \ +# t_image = tf.map_fn(lambda img: tf.image.random_hue(img, max_delta=0.032), t_image)") +# exit() +# # print(" [Warning] crop_central_whiten_images will be deprecated due to speed, see TFRecord tutorial for more info...") +# try: +# batch_size = int(images._shape[0]) +# except: +# raise Exception('unknow batch_size of images') +# central_x = tf.Variable(tf.constant(0.1, shape=[1, height, width, 3])) +# i = tf.Variable(tf.constant(0)) +# +# c = lambda central_x, i: tf.less(i, batch_size) +# +# def body(central_x, i): +# # 1. Crop the central [height, width] of the image. +# image = tf.image.resize_image_with_crop_or_pad(tf.gather(images, i), height, width) +# # 2. Subtract off the mean and divide by the variance of the pixels. +# image = tf.image.per_image_whitening(image) +# # 5. Append the image to a batch. +# image = tf.expand_dims(image, 0) +# return tf.concat(0, [central_x, image]), tf.add(i, 1) +# +# result = tf.while_loop(cond=c, body=body, loop_vars=(central_x, i), parallel_iterations=16) +# return result +# +# +# diff --git a/tensorlayer/rein.py b/tensorlayer/rein.py index 9ad3de7c..f37561e4 100644 --- a/tensorlayer/rein.py +++ b/tensorlayer/rein.py @@ -1,5 +1,5 @@ #! 
/usr/bin/python
-# -*- coding: utf8 -*-
+# -*- coding: utf-8 -*-
 
 
@@ -60,26 +60,75 @@ def cross_entropy_reward_loss(logits, actions, rewards, name=None):
 
     Examples
     ----------
-    >>> states_batch_pl = tf.placeholder(tf.float32, shape=[None, D])   # observation for training
-    >>> network = tl.layers.InputLayer(states_batch_pl, name='input_layer')
-    >>> network = tl.layers.DenseLayer(network, n_units=H, act = tf.nn.relu, name='relu1')
-    >>> network = tl.layers.DenseLayer(network, n_units=3, act = tl.activation.identity, name='output_layer')
+    >>> states_batch_pl = tf.placeholder(tf.float32, shape=[None, D])
+    >>> network = InputLayer(states_batch_pl, name='input')
+    >>> network = DenseLayer(network, n_units=H, act=tf.nn.relu, name='relu1')
+    >>> network = DenseLayer(network, n_units=3, name='out')
     >>> probs = network.outputs
     >>> sampling_prob = tf.nn.softmax(probs)
     >>> actions_batch_pl = tf.placeholder(tf.int32, shape=[None])
     >>> discount_rewards_batch_pl = tf.placeholder(tf.float32, shape=[None])
-    >>> loss = cross_entropy_reward_loss(probs, actions_batch_pl, discount_rewards_batch_pl)
+    >>> loss = tl.rein.cross_entropy_reward_loss(probs, actions_batch_pl, discount_rewards_batch_pl)
     >>> train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)
     """
-    try: # TF 1.0
+    try: # TF 1.0+
         cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions, logits=logits, name=name)
     except:
         cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, targets=actions)
         # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, actions)
 
-    try: ## TF1.0
+    try: ## TF1.0+
         loss = tf.reduce_sum(tf.multiply(cross_entropy, rewards))
     except: ## TF0.12
         loss = tf.reduce_sum(tf.mul(cross_entropy, rewards))   # element-wise mul
     return loss
+
+def log_weight(probs, weights, name='log_weight'):
+    """Compute ``reduce_mean(log(probs) * weights)``, a weighted log-probability,
+    usually used as a policy-gradient style objective.
+
+    Parameters
+    -----------
+    probs : tensor
+        If it is a network output, usually we should scale it to [0, 1] via softmax.
+    weights : tensor
+    """
+    with tf.variable_scope(name):
+        exp_v = tf.reduce_mean(tf.log(probs) * weights)
+        return exp_v
+
+
+def choice_action_by_probs(probs=[0.5, 0.5], action_list=None):
+    """Choose and return an action, given the probability distribution over actions.
+
+    Parameters
+    ------------
+    probs : a list of float.
+        The probability distribution of all actions.
+    action_list : None, or a list of actions (integers, strings, etc.).
+        If None, an integer between 0 and len(probs)-1 is returned.
+
+    Examples
+    ----------
+    >>> for _ in range(5):
+    >>>     a = choice_action_by_probs([0.2, 0.4, 0.4])
+    >>>     print(a)
+    ... 0
+    ... 1
+    ... 1
+    ... 2
+    ... 1
+    >>> for _ in range(3):
+    >>>     a = choice_action_by_probs([0.5, 0.5], ['a', 'b'])
+    >>>     print(a)
+    ... a
+    ... b
+    ... b
+    """
+    if action_list is None:
+        n_action = len(probs)
+        action_list = np.arange(n_action)
+    else:
+        assert len(action_list) == len(probs), "Number of actions should equal the number of probabilities."
+    return np.random.choice(action_list, p=probs)
diff --git a/tensorlayer/third_party/__init__.py b/tensorlayer/third_party/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tensorlayer/third_party/roi_pooling/.gitignore b/tensorlayer/third_party/roi_pooling/.gitignore
new file mode 100644
index 00000000..08030a8f
--- /dev/null
+++ b/tensorlayer/third_party/roi_pooling/.gitignore
@@ -0,0 +1,3 @@
+.ipynb_checkpoints/
+build/
+
diff --git a/tensorlayer/third_party/roi_pooling/README.md b/tensorlayer/third_party/roi_pooling/README.md
new file mode 100644
index 00000000..d597cea9
--- /dev/null
+++ b/tensorlayer/third_party/roi_pooling/README.md
@@ -0,0 +1,56 @@
+# Hint from TensorLayer
+- This implementation is from `https://github.com/deepsense-ai/roi-pooling`, date: 31 Aug 2017.
+- To install this, you have to clone TensorLayer from GitHub instead of installing it via pip.
+- Remember to modify the `CUDA_LIB` path in the Makefile before running `python setup.py install` in this folder.
+- Make sure `roi_pooling_example.py` and `test_roi_layer.py` are runnable.
+
+
+----
+
+
+## RoI pooling in TensorFlow
+
+This repo contains the implementation of **Region of Interest pooling** as a custom TensorFlow operation. The CUDA code responsible for the computations was largely taken from the original [Caffe implementation by Ross Girshick](https://github.com/rbgirshick/fast-rcnn).
+
+For more information about RoI pooling you can check out [Region of interest pooling explained](https://deepsense.io/region-of-interest-pooling-explained/) at our [deepsense.io](https://deepsense.io/) blog.
+
+![Region of Interest Pooling animation](roi_pooling_animation.gif)
+
+
+## Requirements
+
+To compile and use the `roi_pooling` layer you need to have:
+
+* [CUDA](https://developer.nvidia.com/cuda-toolkit) (tested with 8.0)
+* [TensorFlow](https://www.tensorflow.org/) (tested with 0.12.0 and 1.0.0)
+
+Only official TensorFlow releases are currently supported. If you're using a custom built TensorFlow compiled with a different GCC version (e.g. 5.X) you may need to modify the makefile to [enable the new ABI version](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html).
+
+
+## Install
+
+Since the op has to be compiled against your local CUDA and TensorFlow, install it from source:
+
+```bash
+$ git clone git@github.com:deepsense-io/roi-pooling.git
+$ cd roi-pooling
+$ python setup.py install
+```
+
+Right now we provide only a GPU implementation (no CPU at this time).
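The `CUDA_LIB` variable in `roi_pooling/Makefile` defaults to the CUDA 8.0 location; if your toolkit lives elsewhere, point it at the right directory before building (the 9.0 path below is illustrative, check where your toolkit is actually installed):

```bash
# e.g. switch the Makefile to a hypothetical CUDA 9.0 install, then build:
sed -i 's|/usr/local/cuda-8.0/lib64|/usr/local/cuda-9.0/lib64|' roi_pooling/Makefile
python setup.py install
```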
+ + +## Usage + +After successful installation you can use the operation like this: + +```python +from roi_pooling.roi_pooling_ops import roi_pooling + +# here obtain feature map and regions of interest +rpooling = roi_pooling(feature_map, rois, 7, 7) +# continue the model +``` + +Working example in Jupyter Notebook: [examples/roi_pooling_minimal_example.ipynb](https://github.com/deepsense-io/roi-pooling/blob/master/examples/roi_pooling_minimal_example.ipynb) + diff --git a/tensorlayer/third_party/roi_pooling/examples/__init__.py b/tensorlayer/third_party/roi_pooling/examples/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tensorlayer/third_party/roi_pooling/examples/roi_pooling_minimal_example.ipynb b/tensorlayer/third_party/roi_pooling/examples/roi_pooling_minimal_example.ipynb new file mode 100644 index 00000000..c1edc353 --- /dev/null +++ b/tensorlayer/third_party/roi_pooling/examples/roi_pooling_minimal_example.ipynb @@ -0,0 +1,148 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* blog post: [Region of interest pooling explained - deepsense.io](https://deepsense.io/region-of-interest-pooling-explained/)\n", + "* repository: [deepsense-io/roi-pooling](https://github.com/deepsense-io/roi-pooling)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from __future__ import print_function\n", + "\n", + "import tensorflow as tf\n", + "import numpy as np\n", + "\n", + "from roi_pooling.roi_pooling_ops import roi_pooling" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# 4x4 feature map with only 1 channel\n", + "input_value = [[\n", + " [[1], [2], [4], [4]],\n", + " [[3], [4], [1], [2]],\n", + " [[6], [2], [1], [7]],\n", + " [[1], [3], [2], [8]]\n", + "]]\n", + "input_value = np.asarray(input_value, dtype='float32')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# regions of interest as lists of:\n", + "# feature map index, upper left, bottom right coordinates\n", + "rois_value = [\n", + " [0, 0, 0, 1, 3],\n", + " [0, 2, 2, 3, 3],\n", + " [0, 1, 0, 3, 2]\n", + "]\n", + "rois_value = np.asarray(rois_value, dtype='int32')\n", + "\n", + "# in this case we have 3 RoI pooling operations:\n", + "# * channel 0, rectangular region (0, 0) to (1, 3)\n", + "# xx..\n", + "# xx..\n", + "# xx..\n", + "# xx..\n", + "#\n", + "# * channel 0, rectangular region (2, 2) to (3, 3)\n", + "# ....\n", + "# ....\n", + "# ..xx\n", + "# ..xx\n", + "# * channel 0, rectangular region (1, 0) to (3, 2)\n", + "# ....\n", + "# xxx.\n", + "# xxx.\n", + "# xxx." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[[ 3. 4.]\n", + " [ 6. 3.]]]\n", + "\n", + "\n", + " [[[ 1. 7.]\n", + " [ 2. 8.]]]\n", + "\n", + "\n", + " [[[ 4. 4.]\n", + " [ 4. 
7.]]]]\n" + ] + } + ], + "source": [ + "input_featuremap = tf.placeholder(tf.float32)\n", + "rois = tf.placeholder(tf.int32)\n", + "input_const = tf.constant(input_value, tf.float32)\n", + "rois_const = tf.constant(rois_value, tf.int32)\n", + "y = roi_pooling(input_const, rois_const, pool_height=2, pool_width=2)\n", + "\n", + "with tf.Session('') as sess:\n", + " y_output = sess.run(y, feed_dict={input_featuremap: input_value, rois: rois_value})\n", + " print(y_output)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling/Makefile b/tensorlayer/third_party/roi_pooling/roi_pooling/Makefile new file mode 100644 index 00000000..db9de786 --- /dev/null +++ b/tensorlayer/third_party/roi_pooling/roi_pooling/Makefile @@ -0,0 +1,18 @@ +TF_INC = $(shell python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())') +CUDA_LIB = /usr/local/cuda-8.0/lib64 + +all: clean build test + +build: roi_pooling.so + +roi_pooling.cu.o: roi_pooling.cu.cc + nvcc -std=c++11 -c -o $@ $? -I $(TF_INC) -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC -D _GLIBCXX_USE_CXX11_ABI=0 + +roi_pooling.so: roi_pooling.cc roi_pooling.cu.o + g++ -std=c++11 -shared -o $@ $? -I $(TF_INC) -fPIC -lcudart -L$(CUDA_LIB) -D _GLIBCXX_USE_CXX11_ABI=0 + +test: build + python roi_pooling_test.py + +clean: + rm -f *.o *.so *.pyc *.npy diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling/__init__.py b/tensorlayer/third_party/roi_pooling/roi_pooling/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling.cc b/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling.cc new file mode 100644 index 00000000..d1f123dc --- /dev/null +++ b/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling.cc @@ -0,0 +1,162 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include +#include +#include + +using namespace tensorflow; +using namespace std; + +REGISTER_OP("RoiPooling") +.Input("input: float32") +.Input("rois: int32") +.Attr("pool_height: int") +.Attr("pool_width: int") +.Output("output: float32") +.Output("argmax_output: int32"); + + +#define Dtype float + +void RoiPoolingKernelLauncher(const float* input, const int* rois, int n_rois, int channels, int height, int width, + int pooled_height, int pooled_width, Dtype* output, int* argmax_output); + +// IMPORTANT(maciek): need info about storage of the data in memory, assumed something but need the docs confirming it + +class RoiPoolingOp : public OpKernel { + private: + int pool_height_, pool_width_; + public: + explicit RoiPoolingOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("pool_height", &pool_height_)); + + OP_REQUIRES_OK(context, + context->GetAttr("pool_width", &pool_width_)); + } + + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& input_tensor = context->input(0); + const Tensor& rois_tensor = 
context->input(1); + + auto input = input_tensor.flat(); + auto rois = rois_tensor.flat(); + + // Create an output tensor + Tensor* output_tensor = NULL; + Tensor* argmax_output_tensor = NULL; + + auto input_shape = input_tensor.shape(); + auto rois_shape = rois_tensor.shape(); + + int n_rois = rois_shape.dim_size(0); + int height = input_shape.dim_size(1); + int width = input_shape.dim_size(2); + int channels = input_shape.dim_size(3); + + TensorShape output_shape = TensorShape({static_cast(n_rois), + static_cast(channels), + static_cast(pool_height_), + static_cast(pool_width_)}); + + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, + &output_tensor)); + + OP_REQUIRES_OK(context, context->allocate_output(1, output_shape, + &argmax_output_tensor)); + + auto output = output_tensor->template flat(); + auto argmax_output = argmax_output_tensor->template flat(); + + RoiPoolingKernelLauncher(input.data(), rois.data(), + n_rois, channels, + height, width, + pool_height_, pool_width_, + output.data(), argmax_output.data()); + } +}; + +REGISTER_KERNEL_BUILDER(Name("RoiPooling").Device(DEVICE_GPU), RoiPoolingOp); + +///////////// RoiPoolingGrad + + +REGISTER_OP("RoiPoolingGrad") +.Input("orig_input: float32") +.Input("orig_rois: int32") +.Input("orig_output: float32") +.Input("orig_argmax_output: int32") +.Input("orig_output_grad: float32") +.Attr("pool_height: int") +.Attr("pool_width: int") +.Output("output: float32") +.Doc(R"doc( + region of interest pooling grad +)doc"); + +#define Dtype float +void RoiPoolingGradKernelLauncher(const Dtype* orig_input, const int* orig_rois, + int mb_size, + int n_rois, int channels, int height, int width, + int pooled_height, int pooled_width, + const Dtype* orig_output, const int* orig_argmax_output, + const Dtype* orig_output_grad, + Dtype* output); + +// IMPORTANT(maciek): need info about storage of the data in memory, assumed something but need the docs confirming it + +class RoiPoolingGradOp : public OpKernel { + private: + int pool_height_, pool_width_; + public: + explicit RoiPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("pool_height", &pool_height_)); + + OP_REQUIRES_OK(context, + context->GetAttr("pool_width", &pool_width_)); + } + + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& orig_input_tensor = context->input(0); + const Tensor& orig_rois_tensor = context->input(1); + const Tensor& orig_output_tensor = context->input(2); + const Tensor& orig_argmax_output_tensor = context->input(3); + const Tensor& orig_output_grad_tensor = context->input(4); + + auto orig_input = orig_input_tensor.flat(); + auto orig_rois = orig_rois_tensor.flat(); + auto orig_output = orig_output_tensor.flat(); + auto orig_argmax_output = orig_argmax_output_tensor.flat(); + auto orig_output_grad = orig_output_grad_tensor.flat(); + + // Create an output tensor + Tensor* output_tensor = NULL; + auto orig_input_shape = orig_input_tensor.shape(); + auto orig_rois_shape = orig_rois_tensor.shape(); + auto grads_shape = orig_input_shape; + + int mb_size = orig_input_shape.dim_size(0); + int n_rois = orig_rois_shape.dim_size(0); + int height = orig_input_shape.dim_size(1); + int width = orig_input_shape.dim_size(2); + int channels = orig_input_shape.dim_size(3); + + OP_REQUIRES_OK(context, context->allocate_output(0, grads_shape, + &output_tensor)); + + auto output = output_tensor->template flat(); + + // Call the cuda kernel launcher + 
RoiPoolingGradKernelLauncher(orig_input.data(), orig_rois.data(), + mb_size, n_rois, channels, height, width, pool_height_, pool_width_, + orig_output.data(), orig_argmax_output.data(), orig_output_grad.data(), output.data()); + } +}; + + +REGISTER_KERNEL_BUILDER(Name("RoiPoolingGrad").Device(DEVICE_GPU), RoiPoolingGradOp); diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling.cu.cc b/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling.cu.cc new file mode 100644 index 00000000..bbacb552 --- /dev/null +++ b/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling.cu.cc @@ -0,0 +1,214 @@ +#if GOOGLE_CUDA + +#include +#include +#define EIGEN_USE_GPU +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +// CUDA: index helpers +#define idx4_4(index, d1, d2, d3, d4) (index % d4) +#define idx4_3(index, d1, d2, d3, d4) ((index / d4) % d3) +#define idx4_2(index, d1, d2, d3, d4) ((index / d4 / d3) % d2) +#define idx4_1(index, d1, d2, d3, d4) ((index / d4 / d3 / d2) %d1) + +// CUDA: various checks for different function calls. +#define CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + if (error != cudaSuccess) { \ + return 1; \ + } \ + } while (0) + +// CUDA: grid stride looping +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +// CUDA: use 512 threads per block +const int CAFFE_CUDA_NUM_THREADS = 512; + +// CUDA: number of blocks for threads. +inline int CAFFE_GET_BLOCKS(const int N) { + // TODO rewrite this part to be consistent with tf conventions + int optimal_number_of_blocks = (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS; + int max_number_of_blocks = 65000; + return std::min(optimal_number_of_blocks, max_number_of_blocks); +} + + +#define Dtype float + +__global__ void RoiPoolingKernel(const Dtype* input, const int* rois, + int n_rois, int channels, int height, int width, + int pooled_height, int pooled_width, + Dtype* output, int* argmax_output) { + int output_size = n_rois * channels * pooled_height * pooled_width; + + CUDA_KERNEL_LOOP(index, output_size) { + // (n, c, ph, pw) is an element in the pooled output + int pw = idx4_4(index, n_rois, channels, pooled_height, pooled_width); + int ph = idx4_3(index, n_rois, channels, pooled_height, pooled_width); + int c = idx4_2(index, n_rois, channels, pooled_height, pooled_width); + int n = idx4_1(index, n_rois, channels, pooled_height, pooled_width); + + auto bottom_rois_act = rois + n * 5; + + int roi_batch_ind = bottom_rois_act[0]; + int roi_start_w = bottom_rois_act[1]; + int roi_start_h = bottom_rois_act[2]; + int roi_end_w = bottom_rois_act[3]; + int roi_end_h = bottom_rois_act[4]; + + // Force malformed ROIs to be 1x1 + // NOTE(maciek): roi_start, roi_end seems to be inclusive + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + + // divide the ROIs into smaller regions for max pooling + Dtype bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + Dtype bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // compute the precise coordinates of each pooling subregion of the ROIs + int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); 
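+    // A worked instance of the floor/ceil bin arithmetic above (the numbers
+    // are illustrative, not from the original sources): a 5-wide RoI pooled
+    // to pool_width = 2 gives bin_size_w = 2.5, so pw = 0 covers columns
+    // [0, 3) and pw = 1 covers columns [2, 5). Adjacent bins can overlap
+    // because the start is floored while the end is ceiled.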
+ + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + + //printf("%d %d %d %d %d %d %d %d\n", n, c, pw, ph, hstart, hend, wstart, wend); + + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Define an empty pooling region to be zero + + Dtype maxval = is_empty ? 0 : -999999999.0; + //Dtype maxval = is_empty ? 0 : -FLT_MAX; + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + + int maxidx = -1; + auto input_act = input + (roi_batch_ind * height * width * channels); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = (h * width + w) * channels + c; + + // bottom index is relative to 2d image only + if (input_act[bottom_index] > maxval) { + maxval = input_act[bottom_index]; + maxidx = bottom_index; + } + } + } + output[index] = maxval; + argmax_output[index] = maxidx; + } +} + + +void RoiPoolingKernelLauncher(const float* input, const int* rois, int n_rois, int channels, int height, int width, + int pooled_height, int pooled_width, Dtype* output, int* argmax_output) { + int out_size = n_rois * channels * pooled_height * pooled_width; + + RoiPoolingKernel<<>>(input, rois, n_rois, channels, height, width, + pooled_height, pooled_width, output, argmax_output); +} + + +/////////////// Grad +__global__ void RoiPoolingGradKernel(const Dtype* orig_input, const int* orig_rois, + int mb_size, + int n_rois, int channels, int height, int width, + int pooled_height, int pooled_width, + const Dtype* orig_output, const int* orig_argmax_output, + const Dtype* orig_output_grad, + Dtype* output) { + + int orig_input_size = mb_size * height * width * channels; + + CUDA_KERNEL_LOOP(index, orig_input_size) { + // (n, h, w, c) coords in bottom data + int c = idx4_4(index, mb_size, height, width, channels); + int w = idx4_3(index, mb_size, height, width, channels); + int h = idx4_2(index, mb_size, height, width, channels); + int n = idx4_1(index, mb_size, height, width, channels); + + Dtype gradient = 0; + // Accumulate gradient over all ROIs that pooled this element + for (int roi_n = 0; roi_n < n_rois; ++roi_n) { + const int* offset_bottom_rois = orig_rois + roi_n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + // Skip if ROI's batch index doesn't match n + if (n != roi_batch_ind) { + continue; + } + + int roi_start_w = offset_bottom_rois[1]; + int roi_start_h = offset_bottom_rois[2]; + int roi_end_w = offset_bottom_rois[3]; + int roi_end_h = offset_bottom_rois[4]; + + // Skip if ROI doesn't include (h, w) + const bool in_roi = (w >= roi_start_w && w <= roi_end_w && + h >= roi_start_h && h <= roi_end_h); + if (!in_roi) { + continue; + } + + int offset = (roi_n * channels + c) * pooled_height * pooled_width; + const Dtype* offset_top_diff = orig_output_grad + offset; + const int* offset_argmax_data = orig_argmax_output + offset; + + // Compute feasible set of pooled units that could have pooled + // this bottom unit + + // Force malformed ROIs to be 1x1 + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + + Dtype bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + Dtype bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + int phstart = floor(static_cast(h - roi_start_h) / bin_size_h); + int phend = ceil(static_cast(h - roi_start_h + 1) / 
bin_size_h); + int pwstart = floor(static_cast(w - roi_start_w) / bin_size_w); + int pwend = ceil(static_cast(w - roi_start_w + 1) / bin_size_w); + + phstart = min(max(phstart, 0), pooled_height); + phend = min(max(phend, 0), pooled_height); + pwstart = min(max(pwstart, 0), pooled_width); + pwend = min(max(pwend, 0), pooled_width); + + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (offset_argmax_data[ph * pooled_width + pw] == (h * width + w)) { + gradient += offset_top_diff[ph * pooled_width + pw]; + } + } + } + } + output[index] = gradient; + } + +} + +void RoiPoolingGradKernelLauncher(const Dtype* orig_input, const int* orig_rois, + int mb_size, + int n_rois, int channels, int height, int width, + int pooled_height, int pooled_width, + const Dtype* orig_output, const int* orig_argmax_output, + const Dtype* orig_output_grad, + Dtype* output) { + int out_size = mb_size * height * width * channels; + RoiPoolingGradKernel<<>>(orig_input, orig_rois, + mb_size, n_rois, channels, height, width, pooled_height, pooled_width, + orig_output, orig_argmax_output, orig_output_grad, output); +} + +#endif diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling_ops.py b/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling_ops.py new file mode 100644 index 00000000..5c46dc37 --- /dev/null +++ b/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling_ops.py @@ -0,0 +1,51 @@ +import tensorflow as tf +from tensorflow.python.framework import ops +import os + +module_path = os.path.realpath(__file__) +module_dir = os.path.dirname(module_path) +lib_path = os.path.join(module_dir, 'roi_pooling.so') +roi_pooling_module = tf.load_op_library(lib_path) + +def roi_pooling(input, rois, pool_height, pool_width): + """ + returns a tensorflow operation for computing the Region of Interest Pooling + + @arg input: feature maps on which to perform the pooling operation + @arg rois: list of regions of interest in the format (feature map index, upper left, bottom right) + @arg pool_width: size of the pooling sections + """ + # TODO(maciek): ops scope + out = roi_pooling_module.roi_pooling(input, rois, pool_height=pool_height, pool_width=pool_width) + output, argmax_output = out[0], out[1] + return output + + +@ops.RegisterGradient("RoiPooling") +def _RoiPoolingGrad(op, *grads): + orig_inputs = op.inputs[0] + orig_rois = op.inputs[1] + orig_output = op.outputs[0] + orig_argmax_output = op.outputs[1] + + orig_output_grad = grads[0] + output_grad = roi_pooling_module.roi_pooling_grad(orig_inputs, orig_rois, orig_output, + orig_argmax_output, orig_output_grad, + pool_height=op.get_attr('pool_height'), + pool_width=op.get_attr('pool_width')) + return [output_grad, None] + + +@ops.RegisterShape("RoiPooling") +def _RoiPoolingShape(op): + input = op.inputs[0] + rois = op.inputs[1] + + n_rois = rois.get_shape()[0] + n_channels = input.get_shape()[3] + pool_height = op.get_attr('pool_height') + pool_width = op.get_attr('pool_width') + + #TODO: check the width/hegiht order + return [tf.TensorShape([n_rois, n_channels, pool_width, pool_height]), + tf.TensorShape(None)] diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling_test.py b/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling_test.py new file mode 100644 index 00000000..c5f1b361 --- /dev/null +++ b/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling_test.py @@ -0,0 +1,110 @@ +import tensorflow as tf +import numpy as np +from roi_pooling_ops import roi_pooling + + +class 
RoiPoolingTest(tf.test.TestCase): + # TODO(maciek): add python, implementation and test outputs + # TODO(maciek): test pool_height != pool_width, height != width + + def test_roi_pooling_grad(self): + # TODO(maciek): corner cases + input_value = [[ + [[1], [2], [4], [4]], + [[3], [4], [1], [2]], + [[6], [2], [1], [7.0]], + [[1], [3], [2], [8]] + ]] + input_value = np.asarray(input_value, dtype='float32') + + rois_value = [ + [0, 0, 0, 1, 1], + [0, 1, 1, 2, 2], + [0, 2, 2, 3, 3], + [0, 0, 0, 2, 2], + [0, 0, 0, 3, 3] + ] + rois_value = np.asarray(rois_value, dtype='int32') + + with tf.Session(''): + # NOTE(maciek): looks like we have to use consts here, based on tensorflow/python/ops/nn_test.py + input_const = tf.constant(input_value, tf.float32) + rois_const = tf.constant(rois_value, tf.int32) + y = roi_pooling(input_const, rois_const, pool_height=2, pool_width=2) + mean = tf.reduce_mean(y) + + numerical_grad_error_1 = tf.test.compute_gradient_error( + [input_const], [input_value.shape], y, [5, 2, 2, 1]) + + numerical_grad_error_2 = tf.test.compute_gradient_error( + [input_const], [input_value.shape], mean, []) + + self.assertLess(numerical_grad_error_1, 1e-4) + self.assertLess(numerical_grad_error_2, 1e-4) + + def test_shape_inference_1(self): + pooled_w, pooled_h = 2, 2 + input_w, input_h = 200, 200 + n_channels = 3 + n_batches = None + input = tf.placeholder(tf.float32, shape=[n_batches, input_w, input_h, n_channels]) + + n_rois = None + single_roi_dimension = 5 + rois = tf.placeholder(tf.int32, shape=[n_rois, single_roi_dimension]) + + y = roi_pooling(input, rois, pool_height=pooled_w, pool_width=pooled_h) + + self.assertEqual(y.get_shape().ndims, 4) + self.assertIs(y.get_shape()[0].value, n_rois) + self.assertIs(y.get_shape()[1].value, n_channels) + self.assertIs(y.get_shape()[2].value, pooled_h) + self.assertIs(y.get_shape()[3].value, pooled_w) + + def test_shape_inference_2(self): + pooled_w, pooled_h = 3, 4 + input_w, input_h = 200, 300 + n_channels = 3 + n_batches = None + input = tf.placeholder(tf.float32, shape=[n_batches, input_w, input_h, n_channels]) + + n_rois = None + single_roi_dimension = 5 + rois = tf.placeholder(tf.int32, shape=[n_rois, single_roi_dimension]) + + y = roi_pooling(input, rois, pool_height=pooled_w, pool_width=pooled_h) + + self.assertEqual(y.get_shape().ndims, 4) + self.assertIs(y.get_shape()[0].value, n_rois) + self.assertIs(y.get_shape()[1].value, n_channels) + self.assertIs(y.get_shape()[2].value, pooled_h) + self.assertIs(y.get_shape()[3].value, pooled_w) + + def test_very_big_output(self): + """ + This test checks whether the layer can handle a corner case + where the number of output pixels is very large, possibly larger + than the number of available GPU threads + """ + + pooled_w, pooled_h = 7,7 + input_w, input_h = 72, 240 + n_channels = 512 + n_batches = 2 + x_input = np.ones(shape=(n_batches, input_w, input_h, n_channels)) + n_rois = 5000 + rois_input = np.ones(shape=(n_rois, 5)) + + input = tf.placeholder(tf.float32, shape=[n_batches, input_w, input_h, n_channels]) + single_roi_dimension = 5 + rois = tf.placeholder(tf.int32, shape=[n_rois, single_roi_dimension]) + + y = roi_pooling(input, rois, pool_height=pooled_w, pool_width=pooled_h) + + with tf.Session('') as sess: + y_output = sess.run(y, feed_dict={input: x_input, rois: rois_input}) + + self.assertTrue(np.all(y_output == 1)) + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling_animation.gif 
b/tensorlayer/third_party/roi_pooling/roi_pooling_animation.gif new file mode 100644 index 00000000..9d35d21a Binary files /dev/null and b/tensorlayer/third_party/roi_pooling/roi_pooling_animation.gif differ diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling_example.py b/tensorlayer/third_party/roi_pooling/roi_pooling_example.py new file mode 100644 index 00000000..7d9b7b63 --- /dev/null +++ b/tensorlayer/third_party/roi_pooling/roi_pooling_example.py @@ -0,0 +1,63 @@ +from __future__ import print_function + +import tensorflow as tf +import numpy as np + +from roi_pooling.roi_pooling_ops import roi_pooling + +# input feature map going into the RoI pooling +input_value = [[ + [[1], [2], [4], [4]], + [[3], [4], [1], [2]], + [[6], [2], [1], [7.0]], + [[1], [3], [2], [8]] +]] +input_value = np.asarray(input_value, dtype='float32') + +# Regions of interest as lists of: +# feature map index, upper left, bottom right coordinates +rois_value = [ + [0, 0, 0, 1, 1], + [0, 1, 1, 2, 2], + [0, 2, 2, 3, 3], + [0, 0, 0, 2, 2], + [0, 0, 0, 3, 3] +] +rois_value = np.asarray(rois_value, dtype='int32') + +# the pool_height and width are parameters of the ROI layer +pool_height, pool_width = (2, 2) +n_rois = len(rois_value) +y_shape = [n_rois, 1, pool_height, pool_width] + +print('Input: ', input_value, ', shape: ', input_value.shape) +print('ROIs: ', rois_value, ', shape: ', rois_value.shape) + +# precise semantics is now only defined by the kernel, need tests +input = tf.placeholder(tf.float32) +rois = tf.placeholder(tf.int32) + +y = roi_pooling(input, rois, pool_height=2, pool_width=2) +mean = tf.reduce_mean(y) + +grads = tf.gradients(mean, input) +print(type(grads)) +print(len(grads)) +print(grads) +print(input_value.shape) + +with tf.Session('') as sess: + input_const = tf.constant(input_value, tf.float32) + rois_const = tf.constant(rois_value, tf.int32) + y = roi_pooling(input_const, rois_const, pool_height=2, pool_width=2) + mean = tf.reduce_mean(y) + + numerical_grad_error_1 = tf.test.compute_gradient_error([input_const], [input_value.shape], y, y_shape) + numerical_grad_error_2 = tf.test.compute_gradient_error([input_const], [input_value.shape], mean, []) + print(numerical_grad_error_1, numerical_grad_error_2) + +with tf.Session('') as sess: + y_output = sess.run(y, feed_dict={input: input_value, rois: rois_value}) + print('y: ', y_output) + grads_output = sess.run(grads, feed_dict={input: input_value, rois: rois_value}) + print('grads: ', grads_output) diff --git a/tensorlayer/third_party/roi_pooling/setup.py b/tensorlayer/third_party/roi_pooling/setup.py new file mode 100644 index 00000000..de392a9d --- /dev/null +++ b/tensorlayer/third_party/roi_pooling/setup.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python + +from __future__ import print_function +from distutils.core import setup +from distutils.command.install import install as DistutilsInstall +import sys +import subprocess + +try: + import tensorflow +except ImportError: + print("Please install tensorflow 0.12.0 or later") + sys.exit() + + +class MyInstall(DistutilsInstall): + def run(self): + subprocess.call(['make', '-C', 'roi_pooling', 'build']) + DistutilsInstall.run(self) + +setup(name='roi-pooling', + version='1.0', + description='ROI pooling as a custom TensorFlow operation', + author='deepsense.io', + packages=['roi_pooling'], + package_data={'roi_pooling': ['roi_pooling.so']}, + cmdclass={'install': MyInstall} +) + + + + diff --git a/tensorlayer/third_party/roi_pooling/test_roi_layer.py 
b/tensorlayer/third_party/roi_pooling/test_roi_layer.py new file mode 100644 index 00000000..5ca6a12a --- /dev/null +++ b/tensorlayer/third_party/roi_pooling/test_roi_layer.py @@ -0,0 +1,66 @@ +from tensorlayer.layers import * + +from tensorlayer.third_party.roi_pooling.roi_pooling.roi_pooling_ops import roi_pooling +# from roi_pooling.roi_pooling_ops import roi_pooling + + + +# input feature map going into the RoI pooling +input_value = [[ + [[1], [2], [4], [4]], + [[3], [4], [1], [2]], + [[6], [2], [1], [7.0]], + [[1], [3], [2], [8]] +]] +input_value = np.asarray(input_value, dtype='float32') + +# Regions of interest as lists of: +# feature map index, upper left, bottom right coordinates +rois_value = [ + [0, 0, 0, 1, 1], + [0, 1, 1, 2, 2], + [0, 2, 2, 3, 3], + [0, 0, 0, 2, 2], + [0, 0, 0, 3, 3] +] +rois_value = np.asarray(rois_value, dtype='int32') + +# the pool_height and width are parameters of the ROI layer +pool_height, pool_width = (2, 2) +n_rois = len(rois_value) +y_shape = [n_rois, 1, pool_height, pool_width] + +print('Input: ', input_value, ', shape: ', input_value.shape) +print('ROIs: ', rois_value, ', shape: ', rois_value.shape) + +# precise semantics is now only defined by the kernel, need tests +input = tf.placeholder(tf.float32) +rois = tf.placeholder(tf.int32) + +# y = roi_pooling(input, rois, pool_height=2, pool_width=2) +n = InputLayer(input, name='in') +n = ROIPoolingLayer(n, rois=rois, pool_height=2, pool_width=2, name='roi') +y = n.outputs +mean = tf.reduce_mean(y) + +grads = tf.gradients(mean, input) +print(type(grads)) +print(len(grads)) +print(grads) +print(input_value.shape) + +with tf.Session('') as sess: + input_const = tf.constant(input_value, tf.float32) + rois_const = tf.constant(rois_value, tf.int32) + y = roi_pooling(input_const, rois_const, pool_height=2, pool_width=2) + mean = tf.reduce_mean(y) + + numerical_grad_error_1 = tf.test.compute_gradient_error([input_const], [input_value.shape], y, y_shape) + numerical_grad_error_2 = tf.test.compute_gradient_error([input_const], [input_value.shape], mean, []) + print(numerical_grad_error_1, numerical_grad_error_2) + +with tf.Session('') as sess: + y_output = sess.run(y, feed_dict={input: input_value, rois: rois_value}) + print('y: ', y_output) + grads_output = sess.run(grads, feed_dict={input: input_value, rois: rois_value}) + print('grads: ', grads_output) diff --git a/tensorlayer/utils.py b/tensorlayer/utils.py index 0dbdbdc1..12a3f4bb 100644 --- a/tensorlayer/utils.py +++ b/tensorlayer/utils.py @@ -1,5 +1,5 @@ #! /usr/bin/python -# -*- coding: utf8 -*- +# -*- coding: utf-8 -*- import tensorflow as tf import tensorlayer as tl from . import iterate @@ -67,11 +67,11 @@ def fit(sess, network, train_op, cost, X_train, y_train, x, y_, acc=None, batch_ ... X_val=X_val, y_val=y_val, eval_train=False, ... tensorboard=True, tensorboard_weight_histograms=True, tensorboard_graph_vis=True) - Note + Notes -------- - If tensorboard=True, the global_variables_initializer will be run inside the fit function - in order to initalize the automatically generated summary nodes used for tensorboard visualization, - thus tf.global_variables_initializer().run() before the fit() call will be undefined. + If tensorboard=True, the global_variables_initializer will be run inside the fit function + in order to initalize the automatically generated summary nodes used for tensorboard visualization, + thus tf.global_variables_initializer().run() before the fit() call will be undefined. 
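+
+    For example, an illustrative sketch that respects this constraint (not
+    part of the original docstring; ``build_network`` stands in for whatever
+    constructs your model):
+
+    >>> sess = tf.InteractiveSession()
+    >>> network = build_network(x)
+    >>> # no tf.global_variables_initializer().run() here: fit() runs it itself
+    >>> tl.utils.fit(sess, network, train_op, cost, X_train, y_train, x, y_,
+    ...              batch_size=500, n_epoch=100, tensorboard=True)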
""" assert X_train.shape[0] >= batch_size, "Number of training examples should be bigger than the batch size" @@ -130,7 +130,7 @@ def fit(sess, network, train_op, cost, X_train, y_train, x, y_, acc=None, batch_ result = sess.run(merged, feed_dict=feed_dict) train_writer.add_summary(result, tensorboard_train_index) tensorboard_train_index += 1 - if (X_val is not None) and (y_val is not None): + if (X_val is not None) and (y_val is not None): for X_val_a, y_val_a in iterate.minibatches( X_val, y_val, batch_size, shuffle=True): dp_dict = dict_to_one( network.all_drop ) # disable noise layers @@ -178,7 +178,6 @@ def fit(sess, network, train_op, cost, X_train, y_train, x, y_, acc=None, batch_ print("Epoch %d of %d took %fs, loss %f" % (epoch + 1, n_epoch, time.time() - start_time, loss_ep)) print("Total training time: %fs" % (time.time() - start_time_begin)) - def test(sess, network, acc, X_test, y_test, x, y_, batch_size, cost=None): """ Test a given non time-series network by the given test data and metric. @@ -237,7 +236,6 @@ def test(sess, network, acc, X_test, y_test, x, y_, batch_size, cost=None): print(" test loss: %f" % (test_loss/ n_batch)) print(" test acc: %f" % (test_acc/ n_batch)) - def predict(sess, network, X, x, y_op, batch_size=None): """ Return the predict results of given non time-series network. @@ -281,7 +279,21 @@ def predict(sess, network, X, x, y_op, batch_size=None): if result is None: result = result_a else: - result = np.hstack((result, result_a)) + result = np.vstack((result, result_a)) + if result is None: + if len(X) % batch_size != 0: + dp_dict = dict_to_one(network.all_drop) + feed_dict = {x: X[-(len(X) % batch_size):, :], } + feed_dict.update(dp_dict) + result_a = sess.run(y_op, feed_dict=feed_dict) + result = result_a + else: + if len(X) != len(result) and len(X) % batch_size != 0: + dp_dict = dict_to_one(network.all_drop) + feed_dict = {x: X[-(len(X) % batch_size):, :], } + feed_dict.update(dp_dict) + result_a = sess.run(y_op, feed_dict=feed_dict) + result = np.vstack((result, result_a)) return result @@ -349,7 +361,6 @@ def flatten_list(list_of_list=[[],[]]): """ return sum(list_of_list, []) - def class_balancing_oversample(X_train=None, y_train=None, printable=True): """Input the features and labels, return the features and labels after oversampling. @@ -430,6 +441,7 @@ def class_balancing_oversample(X_train=None, y_train=None, printable=True): # ================ End of Classes balancing return X_train, y_train + ## Random def get_random_int(min=0, max=10, number=5, seed=None): """Return a list of random integer by the given range and quantity. @@ -445,6 +457,13 @@ def get_random_int(min=0, max=10, number=5, seed=None): # return [random.randint(min,max) for p in range(0, number)] return [rnd.randint(min,max) for p in range(0, number)] +def list_string_to_dict(string): + """Inputs ``['a', 'b', 'c']``, returns ``{'a': 0, 'b': 1, 'c': 2}``.""" + dictionary = {} + for idx, c in enumerate(string): + dictionary.update({c:idx}) + return dictionary + # # def class_balancing_sequence_4D(X_train, y_train, sequence_length, model='downsampling' ,printable=True): # ''' 输入、输出都是sequence format diff --git a/tensorlayer/visualize.py b/tensorlayer/visualize.py index 33868274..86b65f07 100644 --- a/tensorlayer/visualize.py +++ b/tensorlayer/visualize.py @@ -1,19 +1,50 @@ #! 
/usr/bin/python
-# -*- coding: utf8 -*-
+# -*- coding: utf-8 -*-
 
 import matplotlib
-matplotlib.use('Agg')
-import matplotlib.pyplot as plt
-# import matplotlib.pyplot as plt
+## uncomment this if you get the following error:
+# _tkinter.TclError: no display name and no $DISPLAY environment variable
+# matplotlib.use('Agg')
+
 import numpy as np
 import os
+from . import prepro
 
-
-## Save images
+# save/read image(s)
 import scipy.misc
-def save_image(image, image_path):
+def read_image(image, path=''):
+    """ Read one image.
+
+    Parameters
+    -----------
+    image : string, file name.
+    path : string, folder path.
+    """
+    return scipy.misc.imread(os.path.join(path, image))
+
+def read_images(img_list, path='', n_threads=10, printable=True):
+    """ Return all images in a list, given the path and the name of each image file.
+
+    Parameters
+    -------------
+    img_list : list of string, the image file names.
+    path : string, image folder path.
+    n_threads : int, number of threads used to read the images.
+    printable : bool, print information when reading images, default is True.
+    """
+    imgs = []
+    for idx in range(0, len(img_list), n_threads):
+        b_imgs_list = img_list[idx : idx + n_threads]
+        b_imgs = prepro.threading_data(b_imgs_list, fn=read_image, path=path)
+        # print(b_imgs.shape)
+        imgs.extend(b_imgs)
+        if printable:
+            print('read %d from %s' % (len(imgs), path))
+    return imgs
+
+def save_image(image, image_path=''):
     """Save one image.
 
     Parameters
@@ -21,9 +52,12 @@
     images : numpy array [w, h, c]
     image_path : string.
     """
-    scipy.misc.imsave(image_path, image)
+    try: # RGB
+        scipy.misc.imsave(image_path, image)
+    except: # Greyscale
+        scipy.misc.imsave(image_path, image[:,:,0])
 
-def save_images(images, size, image_path):
+def save_images(images, size, image_path=''):
     """Save mutiple images into one single image.
 
     Parameters
@@ -53,6 +87,78 @@ def imsave(images, size, path):
     assert len(images) <= size[0] * size[1], "number of images should be equal or less than size[0] * size[1] {}".format(len(images))
     return imsave(images, size, image_path)
 
+# for object detection
+def draw_boxes_and_labels_to_image(image, classes=[], coords=[],
+        scores=[], classes_list=[],
+        is_center=True, is_rescale=True, save_name=None):
+    """ Draw bboxes and class labels on an image. Return or save the image with bboxes, example in the docs of ``tl.prepro``.
+
+    Parameters
+    -----------
+    image : RGB image in numpy.array, [height, width, channel].
+    classes : a list of class ID (int).
+    coords : a list of list for coordinates.
+        - Should be [x, y, x2, y2] (up-left and bottom-right format)
+        - If [x_center, y_center, w, h] (set is_center to True).
+    scores : a list of score (float). (Optional)
+    classes_list : list of string, for converting ID to string on image.
+    is_center : boolean, default is True.
+        If coords is [x_center, y_center, w, h], set it to True for converting [x_center, y_center, w, h] to [x, y, x2, y2] (up-left and bottom-right).
+        If coords is [x1, y1, x2, y2], set it to False.
+    is_rescale : boolean, default is True.
+        If True, the input coordinates are proportions of the image width and height, and this API will scale them to pixel units internally.
+        If False, feed the coordinates in pixel units.
+    save_name : None or string
+        The name of the image file (e.g. image.png); if None, the image is not saved.
+
+    References
+    -----------
+    - OpenCV rectangle and putText.
+    - `scikit-image `_.
+    """
+    assert len(coords) == len(classes), "number of coordinates and classes must be equal"
+    if len(scores) > 0:
+        assert len(scores) == len(classes), "number of scores and classes must be equal"
+
+    import cv2
+
+    # image = copy.copy(image) # don't change the original image
+    image = image.copy() # don't change the original image, and avoid error https://stackoverflow.com/questions/30249053/python-opencv-drawing-errors-after-manipulating-array-with-numpy
+
+    imh, imw = image.shape[0:2]
+    thick = int((imh + imw) // 430)
+
+    for i in range(len(coords)):
+        if is_center:
+            x, y, x2, y2 = prepro.obj_box_coord_centroid_to_upleft_butright(coords[i])
+        else:
+            x, y, x2, y2 = coords[i]
+
+        if is_rescale: # scale back to pixel units if the coords are proportions of width and height
+            x, y, x2, y2 = prepro.obj_box_coord_scale_to_pixelunit([x, y, x2, y2], (imh, imw))
+
+        cv2.rectangle(image,
+                (int(x), int(y)), (int(x2), int(y2)), # up-left and bottom-right
+                [0,255,0],
+                thick)
+
+        cv2.putText(
+                image,
+                classes_list[classes[i]] + ((" %.2f" % (scores[i])) if (len(scores) != 0) else " "),
+                (int(x), int(y)), # bottom left
+                0,
+                1.5e-3 * imh, # bigger = larger font
+                [0,0,255], # self.meta['colors'][max_indx],
+                int(thick/2)+1) # bold
+
+    if save_name is not None:
+        # cv2.imwrite('_my.png', image)
+        save_image(image, save_name)
+    # if len(coords) == 0:
+    #     print("draw_boxes_and_labels_to_image: no bboxes exist, cannot draw !")
+    return image
+
+# old APIs
 def W(W=None, second=10, saveable=True, shape=[28,28], name='mnist', fig_idx=2396512):
     """Visualize every columns of the weight matrix to a group of Greyscale img.
 
@@ -75,6 +181,7 @@
     --------
     >>> tl.visualize.W(network.all_params[0].eval(), second=10, saveable=True, name='weight_of_1st_layer', fig_idx=2012)
     """
+    import matplotlib.pyplot as plt
     if saveable is False:
         plt.ion()
     fig = plt.figure(fig_idx)      # show all feature images
@@ -138,6 +245,7 @@
     >>> observation = env.reset()
     >>> tl.visualize.frame(observation)
     """
+    import matplotlib.pyplot as plt
     if saveable is False:
         plt.ion()
     fig = plt.figure(fig_idx)      # show all feature images
@@ -176,6 +284,7 @@
     --------
     >>> tl.visualize.CNN2d(network.all_params[0].eval(), second=10, saveable=True, name='cnn1_mnist', fig_idx=2012)
     """
+    import matplotlib.pyplot as plt
     # print(CNN.shape)    # (5, 5, 3, 64)
     # exit()
     n_mask = CNN.shape[3]
@@ -216,7 +325,6 @@
     plt.draw()
     plt.pause(second)
 
-
 def images2d(images=None, second=10, saveable=True, name='images', dtype=None, fig_idx=3119362):
     """Display a group of RGB or Greyscale images.
@@ -241,6 +349,7 @@
     >>> X_train, y_train, X_test, y_test = tl.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False)
     >>> tl.visualize.images2d(X_train[0:100,:,:,:], second=10, saveable=False, name='cifar10', dtype=np.uint8, fig_idx=20212)
     """
+    import matplotlib.pyplot as plt
     # print(images.shape)    # (50000, 32, 32, 3)
     # exit()
     if dtype:
@@ -311,6 +420,7 @@
     >>> tl.visualize.tsne_embedding(final_embeddings, labels, reverse_dictionary,
     ...                  plot_only=500, second=5, saveable=False, name='tsne')
     """
+    import matplotlib.pyplot as plt
     def plot_with_labels(low_dim_embs, labels, figsize=(18, 18), second=5, saveable=True, name='tsne', fig_idx=9862):
         assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
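
For reviewers who want to exercise the new `roi_pooling` op without opening the notebook, here is a minimal end-to-end sketch. It assumes `roi_pooling.so` has been built with the Makefile above, that a CUDA-capable GPU is present (the op only registers a `DEVICE_GPU` kernel), and a TF 1.x graph-mode session; the feature map and RoI values come from the example scripts in this patch, while the placeholder names (`input_ph`, `rois_ph`) are ours.

```python
from __future__ import print_function

import numpy as np
import tensorflow as tf

from roi_pooling.roi_pooling_ops import roi_pooling

# 1x4x4x1 feature map (NHWC), as in roi_pooling_example.py
input_value = np.asarray([[
    [[1], [2], [4], [4]],
    [[3], [4], [1], [2]],
    [[6], [2], [1], [7]],
    [[1], [3], [2], [8]],
]], dtype='float32')

# each RoI row is (feature map index, x1, y1, x2, y2)
rois_value = np.asarray([
    [0, 0, 0, 1, 1],
    [0, 2, 2, 3, 3],
], dtype='int32')

input_ph = tf.placeholder(tf.float32, shape=[None, None, None, 1])
rois_ph = tf.placeholder(tf.int32, shape=[None, 5])
y = roi_pooling(input_ph, rois_ph, pool_height=2, pool_width=2)

with tf.Session() as sess:
    # output shape is [n_rois, channels, pool_height, pool_width]
    print(sess.run(y, feed_dict={input_ph: input_value, rois_ph: rois_value}))
```

The kernel treats the RoI corner coordinates as inclusive and forces malformed RoIs to 1x1; the output layout is [n_rois, channels, pool_height, pool_width], matching the shape the C++ op allocates.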