diff --git a/neuralmonkey/decoders/decoder.py b/neuralmonkey/decoders/decoder.py
index f94b16951..fcdbc476e 100644
--- a/neuralmonkey/decoders/decoder.py
+++ b/neuralmonkey/decoders/decoder.py
@@ -1,7 +1,7 @@
 # pylint: disable=too-many-lines
 import math
-from typing import (cast, Iterable, List, Callable, Optional,
-                    Any, Tuple, NamedTuple, Union)
+from typing import (cast, Iterable, List, Callable, Optional, Any, Tuple,
+                    NamedTuple)
 
 import numpy as np
 import tensorflow as tf
@@ -13,19 +13,20 @@
                                      PAD_TOKEN_INDEX)
 from neuralmonkey.model.model_part import ModelPart, FeedDict
 from neuralmonkey.model.sequence import EmbeddedSequence
-from neuralmonkey.model.stateful import (TemporalStatefulWithOutput,
-                                         SpatialStatefulWithOutput)
+from neuralmonkey.model.stateful import Stateful
 from neuralmonkey.logging import log, warn
-from neuralmonkey.nn.ortho_gru_cell import OrthoGRUCell
+from neuralmonkey.nn.ortho_gru_cell import OrthoGRUCell, NematusGRUCell
 from neuralmonkey.nn.utils import dropout
 from neuralmonkey.decoders.encoder_projection import (
-    linear_encoder_projection, concat_encoder_projection, empty_initial_state)
+    linear_encoder_projection, concat_encoder_projection, empty_initial_state,
+    EncoderProjection)
 from neuralmonkey.decoders.output_projection import (OutputProjectionSpec,
                                                      nonlinear_output)
 from neuralmonkey.decorators import tensor
 
 
 RNN_CELL_TYPES = {
+    "NematusGRU": NematusGRUCell,
     "GRU": OrthoGRUCell,
     "LSTM": tf.contrib.rnn.LSTMCell
 }
@@ -62,9 +63,7 @@ class Decoder(ModelPart):
     # pylint: disable=too-many-locals
     # pylint: disable=too-many-arguments,too-many-branches,too-many-statements
     def __init__(self,
-                 # TODO only stateful, attention will need temporal or spat.
-                 encoders: List[Union[TemporalStatefulWithOutput,
-                                      SpatialStatefulWithOutput]],
+                 encoders: List[Stateful],
                  vocabulary: Vocabulary,
                  data_id: str,
                  name: str,
@@ -73,9 +72,7 @@ def __init__(self,
                  rnn_size: int = None,
                  embedding_size: int = None,
                  output_projection: OutputProjectionSpec = None,
-                 encoder_projection: Callable[
-                     [tf.Tensor, Optional[int], Optional[List[Any]]],
-                     tf.Tensor]=None,
+                 encoder_projection: EncoderProjection = None,
                  attentions: List[BaseAttention] = None,
                  embeddings_source: EmbeddedSequence = None,
                  attention_on_input: bool = True,
@@ -164,7 +161,8 @@ def __init__(self,
         assert self.rnn_size is not None
 
         if self._rnn_cell_str not in RNN_CELL_TYPES:
-            raise ValueError("RNN cell must be a either 'GRU' or 'LSTM'")
+            raise ValueError("RNN cell must be a either 'GRU', 'LSTM', or "
+                             "'NematusGRU'. Not {}".format(self._rnn_cell_str))
 
         if self.output_projection_spec is None:
             log("No output projection specified - using tanh projection")
@@ -365,7 +363,11 @@ def _get_rnn_cell(self) -> tf.contrib.rnn.RNNCell:
         return RNN_CELL_TYPES[self._rnn_cell_str](self.rnn_size)
 
     def _get_conditional_gru_cell(self) -> tf.contrib.rnn.GRUCell:
-        return tf.contrib.rnn.GRUCell(self.rnn_size)
+        if self._rnn_cell_str == "NematusGRU":
+            return NematusGRUCell(
+                self.rnn_size, use_state_bias=True, use_input_bias=False)
+
+        return RNN_CELL_TYPES[self._rnn_cell_str](self.rnn_size)
 
     def embed_input_symbol(self, *args) -> tf.Tensor:
         loop_state = LoopState(*args)
@@ -403,16 +405,17 @@ def body(*args) -> LoopState:
 
                 # Run the RNN.
                 cell = self._get_rnn_cell()
-                if self._rnn_cell_str == "GRU":
-                    cell_output, state = cell(rnn_input,
-                                              loop_state.prev_rnn_output)
-                    next_state = state
+                if self._rnn_cell_str in ["GRU", "NematusGRU"]:
+                    cell_output, next_state = cell(
+                        rnn_input, loop_state.prev_rnn_output)
+
                     attns = [
                         a.attention(cell_output, loop_state.prev_rnn_output,
                                     rnn_input, att_loop_state, loop_state.step)
                         for a, att_loop_state in zip(
                             self.attentions,
                             loop_state.attention_loop_states)]
+
                     if self.attentions:
                         contexts, att_loop_states = zip(*attns)
                     else:
@@ -421,8 +424,9 @@ def body(*args) -> LoopState:
                     if self._conditional_gru:
                         cell_cond = self._get_conditional_gru_cell()
                         cond_input = tf.concat(contexts, -1)
-                        cell_output, state = cell_cond(cond_input, state,
-                                                       scope="cond_gru_2_cell")
+                        cell_output, next_state = cell_cond(
+                            cond_input, next_state, scope="cond_gru_2_cell")
+
                 elif self._rnn_cell_str == "LSTM":
                     prev_state = tf.contrib.rnn.LSTMStateTuple(
                         loop_state.prev_rnn_state, loop_state.prev_rnn_output)
diff --git a/neuralmonkey/decoders/encoder_projection.py b/neuralmonkey/decoders/encoder_projection.py
index 817da2192..36b55092e 100644
--- a/neuralmonkey/decoders/encoder_projection.py
+++ b/neuralmonkey/decoders/encoder_projection.py
@@ -2,40 +2,47 @@
 
 This module contains different variants of projection of encoders into the
 initial state of the decoder.
-"""
 
+Encoder projections are specified in the configuration file.  Each encoder
+projection function has a unified type ``EncoderProjection``, which is a
+callable that takes three arguments:
+
+1. ``train_mode`` -- boolean tensor specifying whether the train mode is on
+2. ``rnn_size`` -- the size of the resulting initial state
+3. ``encoders`` -- a list of ``Stateful`` objects used as the encoders.
 
-from typing import List, Optional, Callable
+To enable further parameterization of encoder projection functions, one can
+use higher-order functions.
+"""
+from typing import List, Callable, cast
 
 import tensorflow as tf
+from typeguard import check_argument_types
 
-from neuralmonkey.model.stateful import Stateful
+from neuralmonkey.model.stateful import Stateful, TemporalStatefulWithOutput
 from neuralmonkey.nn.utils import dropout
 from neuralmonkey.logging import log
 
 
+# pylint: disable=invalid-name
+EncoderProjection = Callable[
+    [tf.Tensor, int, List[Stateful]], tf.Tensor]
+# pylint: enable=invalid-name
+
+
 # pylint: disable=unused-argument
 # The function must conform the API
 def empty_initial_state(train_mode: tf.Tensor,
-                        rnn_size: Optional[int],
+                        rnn_size: int,
                         encoders: List[Stateful] = None) -> tf.Tensor:
-    """Return an empty vector.
-
-    Arguments:
-        train_mode: tf 0-D bool Tensor specifying the training mode (not used)
-        rnn_size: The size of the resulting vector
-        encoders: The list of encoders (not used)
-    """
+    """Return an empty vector."""
     if rnn_size is None:
-        raise ValueError("You must supply rnn_size for this type of "
-                         "encoder projection")
+        raise ValueError(
+            "You must supply rnn_size for this type of encoder projection")
     return tf.zeros([rnn_size])
 
 
-def linear_encoder_projection(
-        dropout_keep_prob: float) -> Callable[
-            [tf.Tensor, Optional[int], Optional[List[Stateful]]],
-            tf.Tensor]:
+def linear_encoder_projection(dropout_keep_prob: float) -> EncoderProjection:
     """Return a linear encoder projection.
 
     Return a projection function which applies dropout on concatenated
@@ -45,61 +52,82 @@ def linear_encoder_projection(
     Arguments:
         dropout_keep_prob: The dropout keep probability
     """
+    check_argument_types()
+
     def func(train_mode: tf.Tensor,
-             rnn_size: Optional[int] = None,
-             encoders: Optional[List[Stateful]] = None) -> tf.Tensor:
-        """Linearly project encoders' encoded value.
-
-        Linearly project encoders' encoded value to rnn_size
-        and apply dropout.
-
-        Arguments:
-            train_mode: tf 0-D bool Tensor specifying the training mode
-            rnn_size: The size of the resulting vector
-            encoders: The list of encoders
-        """
-        if rnn_size is None:
-            raise ValueError("You must supply rnn_size for this type of "
-                             "encoder projection")
+             rnn_size: int,
+             encoders: List[Stateful]) -> tf.Tensor:
 
-        if encoders is None or not encoders:
-            raise ValueError("There must be at least one encoder for this type"
-                             " of encoder projection")
+        if rnn_size is None:
+            raise ValueError(
+                "You must supply rnn_size for this type of encoder projection")
 
-        encoded_concat = tf.concat([e.output for e in encoders], 1)
-        encoded_concat = dropout(
-            encoded_concat, dropout_keep_prob, train_mode)
+        en_concat = concat_encoder_projection(train_mode, None, encoders)
+        en_concat = dropout(en_concat, dropout_keep_prob, train_mode)
 
-        return tf.layers.dense(encoded_concat, rnn_size,
-                               name="encoders_projection")
+        return tf.layers.dense(en_concat, rnn_size, name="encoders_projection")
 
-    return func
+    return cast(EncoderProjection, func)
 
 
 def concat_encoder_projection(
         train_mode: tf.Tensor,
-        rnn_size: Optional[int] = None,
-        encoders: Optional[List[Stateful]] = None) -> tf.Tensor:
-    """Create the initial state by concatenating the encoders' encoded values.
+        rnn_size: int = None,
+        encoders: List[Stateful] = None) -> tf.Tensor:
+    """Concatenate the encoded values of the encoders."""
 
-    Arguments:
-        train_mode: tf 0-D bool Tensor specifying the training mode (not used)
-        rnn_size: The size of the resulting vector (not used)
-        encoders: The list of encoders
-    """
     if encoders is None or not encoders:
         raise ValueError("There must be at least one encoder for this type "
                          "of encoder projection")
 
-    if rnn_size is not None:
-        assert rnn_size == sum(e.output.get_shape()[1].value
-                               for e in encoders)
-
-    encoded_concat = tf.concat([e.output for e in encoders], 1)
+    output_size = sum(e.output.get_shape()[1].value for e in encoders)
+    if rnn_size is not None and rnn_size != output_size:
+        raise ValueError("RNN size supplied for concat projection ({}) does "
+                         "not match the size of the concatenated vectors ({})."
+                         .format(rnn_size, output_size))
 
-    # pylint: disable=no-member
     log("The inferred rnn_size of this encoder projection will be {}"
-        .format(encoded_concat.get_shape()[1].value))
-    # pylint: enable=no-member
+        .format(output_size))
 
+    encoded_concat = tf.concat([e.output for e in encoders], 1)
     return encoded_concat
+
+
+def nematus_projection(dropout_keep_prob: float = 1.0) -> EncoderProjection:
+    """Return encoder projection used in Nematus.
+
+    The initial state is a dense projection with tanh activation computed on
+    the averaged states of the encoders. Dropout is applied to the means
+    (before the projection).
+
+    Arguments:
+        dropout_keep_prob: The dropout keep probability.
+    """
+    check_argument_types()
+
+    def func(
+            train_mode: tf.Tensor,
+            rnn_size: int,
+            encoders: List[TemporalStatefulWithOutput]) -> tf.Tensor:
+
+        if len(encoders) != 1:
+            raise ValueError("Exactly one encoder required for this type of "
+                             "projection. {} given.".format(len(encoders)))
+        encoder = encoders[0]
+
+        # shape (batch, time)
+        masked_sum = tf.reduce_sum(
+            encoder.temporal_states
+            * tf.expand_dims(encoder.temporal_mask, 2), 1)
+
+        # shape (batch, 1)
+        lengths = tf.reduce_sum(encoder.temporal_mask, 1, keep_dims=True)
+
+        means = masked_sum / lengths
+        means = dropout(means, dropout_keep_prob, train_mode)
+
+        return tf.layers.dense(means, rnn_size,
+                               activation=tf.tanh,
+                               name="encoders_projection")
+
+    return cast(EncoderProjection, func)
diff --git a/neuralmonkey/decoders/output_projection.py b/neuralmonkey/decoders/output_projection.py
index 49cfe57cc..6e787ea35 100644
--- a/neuralmonkey/decoders/output_projection.py
+++ b/neuralmonkey/decoders/output_projection.py
@@ -1,10 +1,25 @@
-"""Module with different variants of projection functions for RNN outputs."""
+"""Output Projection Module.
 
+This module contains different variants of projection functions of decoder
+outputs into the logit function inputs.
+
+Output projections are specified in the configuration file. Each output
+projection function has a unified type ``OutputProjection``, which is a
+callable that takes four arguments and returns a tensor:
+
+1. ``prev_state`` -- the hidden state of the decoder.
+2. ``prev_output`` -- embedding of the previously decoded word (or train input)
+3. ``ctx_tensots`` -- a list of context vectors (for each attention object)
+
+To enable further parameterization of output projection functions, one can
+use higher-order functions.
+"""
 from typing import Union, Tuple, List, Callable
 import tensorflow as tf
 from typeguard import check_argument_types
 
 from neuralmonkey.nn.projection import multilayer_projection, maxout
+from neuralmonkey.nn.utils import dropout
 
 
 # pylint: disable=invalid-name
@@ -57,6 +72,35 @@ def _projection(prev_state, prev_output, ctx_tensors, train_mode):
     return _projection, output_size
 
 
+def nematus_output(
+        output_size: int,
+        dropout_keep_prob: float = 1.0) -> Tuple[OutputProjection, int]:
+    """Apply nonlinear one-hidden-layer deep output.
+
+    Implementation consistent with Nematus.
+    Can be used instead of (and is in theory equivalent to) nonlinear_output.
+
+    Projects the RNN state, embedding of the previously outputted word, and
+    concatenation of all context vectors into a shared vector space, sums them
+    up and apply a hyperbolic tangent activation function.
+    """
+    check_argument_types()
+
+    def _projection(prev_state, prev_output, ctx_tensors, train_mode):
+        prev_state = dropout(prev_state, dropout_keep_prob, train_mode)
+        prev_output = dropout(prev_output, dropout_keep_prob, train_mode)
+        ctx_concat = tf.concat(ctx_tensors, 1)
+        ctx = dropout(ctx_concat, dropout_keep_prob, train_mode)
+
+        logit_rnn = tf.layers.dense(prev_state, output_size, name="rnn_state")
+        logit_emb = tf.layers.dense(prev_output, output_size, name="prev_out")
+        logit_ctx = tf.layers.dense(ctx, output_size, name="context")
+
+        return tf.tanh(logit_rnn + logit_emb + logit_ctx)
+
+    return _projection, output_size
+
+
 def nonlinear_output(
         output_size: int,
         activation_fn: Callable[[tf.Tensor], tf.Tensor] = tf.tanh
diff --git a/neuralmonkey/encoders/recurrent.py b/neuralmonkey/encoders/recurrent.py
index 4978b3e8b..31890fbdd 100644
--- a/neuralmonkey/encoders/recurrent.py
+++ b/neuralmonkey/encoders/recurrent.py
@@ -5,7 +5,7 @@
 
 from neuralmonkey.model.stateful import TemporalStatefulWithOutput
 from neuralmonkey.model.model_part import ModelPart, FeedDict
-from neuralmonkey.nn.ortho_gru_cell import OrthoGRUCell
+from neuralmonkey.nn.ortho_gru_cell import OrthoGRUCell, NematusGRUCell
 from neuralmonkey.nn.utils import dropout
 from neuralmonkey.vocabulary import Vocabulary
 from neuralmonkey.dataset import Dataset
@@ -18,6 +18,7 @@
 # pylint: enable=invalid-name
 
 RNN_CELL_TYPES = {
+    "NematusGRU": NematusGRUCell,
     "GRU": OrthoGRUCell,
     "LSTM": tf.contrib.rnn.LSTMCell
 }
@@ -52,7 +53,8 @@ def __init__(self,
             raise ValueError("Dropout keep prob must be inside (0,1].")
 
         if self.rnn_cell_str not in RNN_CELL_TYPES:
-            raise ValueError("RNN cell must be a either 'GRU' or 'LSTM'")
+            raise ValueError("RNN cell must be a either 'GRU', 'LSTM', or "
+                             "'NematusGRU'. Not {}".format(self.rnn_cell_str))
 
         if output_size is not None:
             if output_size <= 0:
@@ -63,7 +65,6 @@ def __init__(self,
         else:
             self._project_final_state = False
             self.output_size = 2 * rnn_size
-
     # pylint: enable=too-many-arguments
 
     # pylint: disable=no-self-use
@@ -97,7 +98,7 @@ def temporal_states(self) -> tf.Tensor:
     @tensor
     def output(self) -> tf.Tensor:
         # pylint: disable=unsubscriptable-object
-        if self.rnn_cell_str == "GRU":
+        if self.rnn_cell_str in ["GRU", "NematusGRU"]:
             output = tf.concat(self.bidirectional_rnn[1], 1)
         elif self.rnn_cell_str == "LSTM":
             # TODO is "h" what we want?
diff --git a/neuralmonkey/model/sequence.py b/neuralmonkey/model/sequence.py
index 220f9c59b..469a17664 100644
--- a/neuralmonkey/model/sequence.py
+++ b/neuralmonkey/model/sequence.py
@@ -86,12 +86,15 @@ def lengths(self) -> tf.Tensor:
 class EmbeddedFactorSequence(Sequence):
     """A `Sequence` that stores one or more embedded inputs (factors)."""
 
+    # pylint: disable=too-many-arguments
     def __init__(self,
                  name: str,
                  vocabularies: List[Vocabulary],
                  data_ids: List[str],
                  embedding_sizes: List[int],
                  max_length: int = None,
+                 add_start_symbol: bool = False,
+                 add_end_symbol: bool = False,
                  save_checkpoint: str = None,
                  load_checkpoint: str = None) -> None:
         """Construct a new instance of `EmbeddedFactorSequence`.
@@ -109,6 +112,8 @@ def __init__(self,
             embedding_sizes: A list of integers specifying the size of the
                 embedding vector for each factor
             max_length: The maximum length of the sequences
+            add_start_symbol: Includes <s> in the sequence
+            add_end_symbol: Includes </s> in the sequence
             save_checkpoint: The save_checkpoint parameter for `ModelPart`
             load_checkpoint: The load_checkpoint parameter for `ModelPart`
         """
@@ -120,6 +125,8 @@ def __init__(self,
         self.vocabulary_sizes = [len(vocab) for vocab in self.vocabularies]
         self.data_ids = data_ids
         self.embedding_sizes = embedding_sizes
+        self.add_start_symbol = add_start_symbol
+        self.add_end_symbol = add_end_symbol
 
         if not (len(self.data_ids)
                 == len(self.vocabularies)
@@ -129,6 +136,7 @@ def __init__(self,
 
         if any([esize <= 0 for esize in self.embedding_sizes]):
             raise ValueError("Embedding size must be a positive integer.")
+    # pylint: enable=too-many-arguments
 
     # TODO this should be placed into the abstract embedding class
     def tb_embedding_visualization(self, logdir: str,
@@ -234,7 +242,8 @@ def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
             factors = dataset.get_series(name)
             vectors, paddings = vocabulary.sentences_to_tensor(
                 list(factors), self.max_length, pad_to_max_len=False,
-                train_mode=train)
+                train_mode=train, add_start_symbol=self.add_start_symbol,
+                add_end_symbol=self.add_end_symbol)
 
             fd[factor_plc] = list(zip(*vectors))
 
@@ -252,12 +261,15 @@ def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
 class EmbeddedSequence(EmbeddedFactorSequence):
     """A sequence of embedded inputs (for a single factor)."""
 
+    # pylint: disable=too-many-arguments
     def __init__(self,
                  name: str,
                  vocabulary: Vocabulary,
                  data_id: str,
                  embedding_size: int,
                  max_length: int = None,
+                 add_start_symbol: bool = False,
+                 add_end_symbol: bool = False,
                  save_checkpoint: str = None,
                  load_checkpoint: str = None) -> None:
         """Construct a new instance of `EmbeddedSequence`.
@@ -270,6 +282,8 @@ def __init__(self,
             embedding_sizes: An integer that specifies the size of the
                 embedding vector for the sequence data
             max_length: The maximum length of the sequences
+            add_start_symbol: Includes <s> in the sequence
+            add_end_symbol: Includes </s> in the sequence
             save_checkpoint: The save_checkpoint parameter for `ModelPart`
             load_checkpoint: The load_checkpoint parameter for `ModelPart`
         """
@@ -280,8 +294,11 @@ def __init__(self,
             data_ids=[data_id],
             embedding_sizes=[embedding_size],
             max_length=max_length,
+            add_start_symbol=add_start_symbol,
+            add_end_symbol=add_end_symbol,
             save_checkpoint=save_checkpoint,
             load_checkpoint=load_checkpoint)
+    # pylint: enable=too-many-arguments
 
     # pylint: disable=unsubscriptable-object
     @property
diff --git a/neuralmonkey/nn/ortho_gru_cell.py b/neuralmonkey/nn/ortho_gru_cell.py
index 3d0e5c53b..4e74f8823 100644
--- a/neuralmonkey/nn/ortho_gru_cell.py
+++ b/neuralmonkey/nn/ortho_gru_cell.py
@@ -5,7 +5,65 @@
 class OrthoGRUCell(tf.contrib.rnn.GRUCell):
     """Classic GRU cell but initialized using random orthogonal matrices."""
 
-    def __call__(self, inputs, state, scope=None):
-        with tf.variable_scope(scope or "OrthoGRUCell") as vscope:
-            vscope.set_initializer(tf.orthogonal_initializer())
-            return super().__call__(inputs, state, vscope)
+    def __init__(self,
+                 num_units,
+                 activation=None,
+                 reuse=None,
+                 bias_initializer=None):
+        tf.contrib.rnn.GRUCell.__init__(
+            self, num_units, activation, reuse, tf.orthogonal_initializer(),
+            bias_initializer)
+
+    def __call__(self, inputs, state, scope="OrthoGRUCell"):
+        return tf.contrib.rnn.GRUCell.__call__(self, inputs, state, scope)
+
+
+# Note that tensorflow does not like when the type annotations are present.
+class NematusGRUCell(tf.contrib.rnn.GRUCell):
+    """Nematus implementation of gated recurrent unit cell.
+
+    The main difference is the order in which the gating functions and linear
+    projections are applied to the hidden state.
+
+    The math is equivalent, in practice there are differences due to float
+    precision errors.
+    """
+
+    def __init__(self, rnn_size, use_state_bias=False, use_input_bias=True):
+        self.use_state_bias = use_state_bias
+        self.use_input_bias = use_input_bias
+
+        tf.contrib.rnn.GRUCell.__init__(self, rnn_size)
+
+    def call(self, inputs, state):
+        """Gated recurrent unit (GRU) with nunits cells."""
+        with tf.variable_scope("gates"):
+            input_to_gates = tf.layers.dense(
+                inputs, 2 * self._num_units, name="input_proj",
+                use_bias=self.use_input_bias)
+
+            # Nematus does the orthogonal initialization probably differently
+            state_to_gates = tf.layers.dense(
+                state, 2 * self._num_units,
+                use_bias=self.use_state_bias,
+                kernel_initializer=tf.orthogonal_initializer(),
+                name="state_proj")
+
+            gates_input = state_to_gates + input_to_gates
+            reset, update = tf.split(
+                tf.sigmoid(gates_input), num_or_size_splits=2, axis=1)
+
+        with tf.variable_scope("candidate"):
+            input_to_candidate = tf.layers.dense(
+                inputs, self._num_units, use_bias=self.use_input_bias,
+                name="input_proj")
+
+            state_to_candidate = tf.layers.dense(
+                state, self._num_units, use_bias=self.use_state_bias,
+                name="state_proj")
+
+            candidate = self._activation(
+                state_to_candidate * reset + input_to_candidate)
+
+        new_state = update * state + (1 - update) * candidate
+        return new_state, new_state
diff --git a/neuralmonkey/vocabulary.py b/neuralmonkey/vocabulary.py
index de36afa9f..b05d77cbc 100644
--- a/neuralmonkey/vocabulary.py
+++ b/neuralmonkey/vocabulary.py
@@ -6,6 +6,7 @@
 # pylint: disable=too-many-lines
 
 import collections
+import json
 import os
 import random
 
@@ -110,6 +111,34 @@ def from_wordlist(path: str,
     return vocabulary
 
 
+def from_nematus_json(path: str, max_size: int = None,
+                      pad_to_max_size: bool = False) -> "Vocabulary":
+    """Load vocabulary from Nematus JSON format.
+
+    The JSON format is a flat dictionary that maps words to their index in the
+    vocabulary.
+
+    Args:
+        path: Path to the file.
+        max_size: Maximum vocabulary size including 'unk' and 'eos' symbols,
+            but not including <pad> and <s> symbol.
+    """
+    with open(path, "r", encoding="utf-8") as f_json:
+        contents = json.load(f_json)
+
+    vocabulary = Vocabulary()
+    for word in sorted(contents.keys(), key=lambda x: contents[x])[2:max_size]:
+        vocabulary.add_word(word)
+
+    if pad_to_max_size:
+        current_length = len(vocabulary)
+        for i in range(max_size - current_length + 2):  # the "2" is ugly HACK
+            word = "<pad_{}>".format(i)
+            vocabulary.add_word(word)
+
+    return vocabulary
+
+
 # pylint: disable=too-many-arguments
 # helper function, this number of parameters is needed
 def from_dataset(datasets: List[Dataset], series_ids: List[str], max_size: int,
diff --git a/scripts/import_nematus.py b/scripts/import_nematus.py
new file mode 100755
index 000000000..7efcc371c
--- /dev/null
+++ b/scripts/import_nematus.py
@@ -0,0 +1,485 @@
+#!/usr/bin/env python3
+"""Imports nematus model file and convert it into a neural monkey experiment
+given a neural monkey configuration file.
+"""
+from typing import Dict, Tuple, List
+import argparse
+import json
+import os
+import numpy as np
+import tensorflow as tf
+
+from neuralmonkey.config.parsing import parse_file
+from neuralmonkey.config.builder import build_config
+from neuralmonkey.attention.feed_forward import Attention
+from neuralmonkey.encoders.recurrent import RecurrentEncoder
+from neuralmonkey.decoders.decoder import Decoder
+from neuralmonkey.decoders.encoder_projection import nematus_projection
+from neuralmonkey.decoders.output_projection import nematus_output
+from neuralmonkey.model.sequence import EmbeddedSequence
+from neuralmonkey.vocabulary import from_nematus_json
+from neuralmonkey.logging import log as _log
+
+
+def log(message: str, color: str = "blue") -> None:
+    _log(message, color)
+
+
+def check_shape(var1_tf: tf.Variable, var2_np: np.ndarray):
+    if var1_tf.get_shape().as_list() != list(var2_np.shape):
+        log("Shapes do not match! Exception will follow.", color="red")
+
+
+# Here come a few functions that fiddle with the Nematus parameters in order to
+# fit them to Neural Monkey parameter shapes.
+def emb_fix_dim1(variables: List[np.ndarray]) -> np.ndarray:
+    return emb_fix(variables, dim=1)
+
+
+def emb_fix(variables: List[np.ndarray], dim: int = 0) -> np.ndarray:
+    """Process nematus tensors with vocabulary dimension.
+
+    Nematus uses only two special symbols, eos and UNK. For embeddings of start
+    and pad tokens, we use zero vectors, inserted to the correct position in
+    the parameter matrix.
+
+    Arguments:
+        variables: the list of variables. Must be of length 1.
+        dim: The vocabulary dimension.
+    """
+    if len(variables) != 1:
+        raise ValueError("VocabFix only works with single vars. {} given."
+                         .format(len(variables)))
+
+    if dim != 0 and dim != 1:
+        raise ValueError("dim can only be 0 or 1. is: {}".format(dim))
+
+    variable = variables[0]
+    shape = variable.shape
+
+    # insert start token (hack from nematus - last from vocab - does it work? NO)
+    # to_insert = np.squeeze(variable[-1] if dim == 0 else variable[:, -1])
+    to_insert = np.zeros(shape[1 - dim]) if len(shape) > 1 else 0.
+    variable = np.insert(variable, 0, to_insert, axis=dim)
+
+    # insert padding token
+    to_insert = np.zeros(shape[1 - dim]) if len(shape) > 1 else 0.
+    variable = np.insert(variable, 0, to_insert, axis=dim)
+
+    return variable
+
+
+def sum_vars(variables: List[np.ndarray]) -> np.ndarray:
+    return sum(variables)
+
+
+def concat_vars(variables: List[np.ndarray]) -> np.ndarray:
+    return np.concatenate(variables)
+
+
+def squeeze(variables: List[np.ndarray]) -> np.ndarray:
+    if len(variables) != 1:
+        raise ValueError("Squeeze only works with single vars. {} given."
+                         .format(len(variables)))
+    return np.squeeze(variables[0])
+
+# pylint: disable=line-too-long
+# No point in line wrapping
+VARIABLE_MAP = {
+    "encoder_input/embedding_matrix_0": (["Wemb"], emb_fix),
+    "decoder/word_embeddings": (["Wemb_dec"], emb_fix),
+    "decoder/state_to_word_W": (["ff_logit_W"], emb_fix_dim1),
+    "decoder/state_to_word_b": (["ff_logit_b"], emb_fix),
+    "encoder/bidirectional_rnn/fw/OrthoGRUCell/gates/state_proj/kernel": (["encoder_U"], None),
+    "encoder/bidirectional_rnn/fw/OrthoGRUCell/gates/input_proj/kernel": (["encoder_W"], None),
+    "encoder/bidirectional_rnn/fw/OrthoGRUCell/gates/input_proj/bias": (["encoder_b"], None),
+    "encoder/bidirectional_rnn/fw/OrthoGRUCell/candidate/state_proj/kernel": (["encoder_Ux"], None),
+    "encoder/bidirectional_rnn/fw/OrthoGRUCell/candidate/input_proj/kernel": (["encoder_Wx"], None),
+    "encoder/bidirectional_rnn/fw/OrthoGRUCell/candidate/input_proj/bias": (["encoder_bx"], None),
+    "encoder/bidirectional_rnn/bw/OrthoGRUCell/gates/state_proj/kernel": (["encoder_r_U"], None),
+    "encoder/bidirectional_rnn/bw/OrthoGRUCell/gates/input_proj/kernel": (["encoder_r_W"], None),
+    "encoder/bidirectional_rnn/bw/OrthoGRUCell/gates/input_proj/bias": (["encoder_r_b"], None),
+    "encoder/bidirectional_rnn/bw/OrthoGRUCell/candidate/state_proj/kernel": (["encoder_r_Ux"], None),
+    "encoder/bidirectional_rnn/bw/OrthoGRUCell/candidate/input_proj/kernel": (["encoder_r_Wx"], None),
+    "encoder/bidirectional_rnn/bw/OrthoGRUCell/candidate/input_proj/bias": (["encoder_r_bx"], None),
+    "decoder/initial_state/encoders_projection/kernel": (["ff_state_W"], None),
+    "decoder/initial_state/encoders_projection/bias": (["ff_state_b"], None),
+    "decoder/attention_decoder/OrthoGRUCell/gates/state_proj/kernel": (["decoder_U"], None),
+    "decoder/attention_decoder/OrthoGRUCell/gates/input_proj/kernel": (["decoder_W"], None),
+    "decoder/attention_decoder/OrthoGRUCell/gates/input_proj/bias": (["decoder_b"], None),
+    "decoder/attention_decoder/OrthoGRUCell/candidate/state_proj/kernel": (["decoder_Ux"], None),
+    "decoder/attention_decoder/OrthoGRUCell/candidate/input_proj/kernel": (["decoder_Wx"], None),
+    "decoder/attention_decoder/OrthoGRUCell/candidate/input_proj/bias": (["decoder_bx"], None),
+    "attention/attn_key_projection": (["decoder_Wc_att"], None),
+    "attention/attn_projection_bias": (["decoder_b_att"], None),
+    "attention/Attention/attn_query_projection": (["decoder_W_comb_att"], None),
+    "attention/attn_similarity_v": (["decoder_U_att"], squeeze),
+    "attention/attn_bias": (["decoder_c_tt"], squeeze),
+    "decoder/attention_decoder/cond_gru_2_cell/gates/state_proj/kernel": (["decoder_U_nl"], None),
+    "decoder/attention_decoder/cond_gru_2_cell/gates/input_proj/kernel": (["decoder_Wc"], None),
+    "decoder/attention_decoder/cond_gru_2_cell/gates/state_proj/bias": (["decoder_b_nl"], None),
+    "decoder/attention_decoder/cond_gru_2_cell/candidate/state_proj/kernel": (["decoder_Ux_nl"], None),
+    "decoder/attention_decoder/cond_gru_2_cell/candidate/input_proj/kernel": (["decoder_Wcx"], None),
+    "decoder/attention_decoder/cond_gru_2_cell/candidate/state_proj/bias": (["decoder_bx_nl"], None),
+    "decoder/attention_decoder/rnn_state/kernel": (["ff_logit_lstm_W"], None),
+    "decoder/attention_decoder/rnn_state/bias": (["ff_logit_lstm_b"], None),
+    "decoder/attention_decoder/prev_out/kernel": (["ff_logit_prev_W"], None),
+    "decoder/attention_decoder/prev_out/bias": (["ff_logit_prev_b"], None),
+    "decoder/attention_decoder/context/kernel": (["ff_logit_ctx_W"], None),
+    "decoder/attention_decoder/context/bias": (["ff_logit_ctx_b"], None)
+}
+# pylint: enable=line-too-long
+
+ENCODER_NAME = "encoder"
+DECODER_NAME = "decoder"
+ATTENTION_NAME = "attention"
+
+
+def load_nematus_json(path: str) -> Dict:
+    with open(path, "r", encoding="utf-8") as f_json:
+        contents = json.load(f_json)
+
+    prefix = os.path.realpath(os.path.dirname(path))
+    config = {
+        "encoder_type": contents["encoder"],
+        "decoder_type": contents["decoder"],
+        "n_words_src": contents["n_words_src"],
+        "n_words_tgt": contents["n_words"],
+        "variables_file": contents["saveto"],
+        "rnn_size": contents["dim"],
+        "embedding_size": contents["dim_word"],
+        "src_vocabulary": os.path.join(
+            prefix, contents["dictionaries"][0]),
+        "tgt_vocabulary": os.path.join(
+            prefix, contents["dictionaries"][1]),
+        "max_length": contents["maxlen"]
+    }
+
+    if config["encoder_type"] != "gru":
+        raise ValueError("Unsupported encoder type: {}"
+                         .format(config["encoder_type"]))
+
+    if config["decoder_type"] != "gru_cond":
+        raise ValueError("Unsupported decoder type: {}"
+                         .format(config["decoder_type"]))
+
+    if not os.path.isfile(config["src_vocabulary"]):
+        raise FileNotFoundError("Vocabulary file not found: {}"
+                                .format(config["src_vocabulary"]))
+
+    if not os.path.isfile(config["tgt_vocabulary"]):
+        raise FileNotFoundError("Vocabulary file not found: {}"
+                                .format(config["tgt_vocabulary"]))
+
+    return config
+
+
+VOCABULARY_TEMPLATE = """\
+[vocabulary_{}]
+class=vocabulary.from_nematus_json
+path="{}"
+max_size={}
+pad_to_max_size=True
+"""
+
+ENCODER_TEMPLATE = """\
+[encoder]
+class=encoders.RecurrentEncoder
+name="{}"
+input_sequence=<input_sequence>
+rnn_size={}
+rnn_cell="NematusGRU"
+dropout_keep_prob=1.0
+
+[input_sequence]
+class=model.sequence.EmbeddedSequence
+name="{}"
+vocabulary=<vocabulary_src>
+data_id="source"
+embedding_size={}
+max_length={}
+add_end_symbol=True
+"""
+
+
+def build_encoder(config: Dict) -> Tuple[RecurrentEncoder, str]:
+    vocabulary = from_nematus_json(
+        config["src_vocabulary"], max_size=config["n_words_src"],
+        pad_to_max_size=True)
+
+    vocabulary_ini = VOCABULARY_TEMPLATE.format(
+        "src", config["src_vocabulary"], config["n_words_src"])
+
+    inp_seq_name = "{}_input".format(ENCODER_NAME)
+    inp_seq = EmbeddedSequence(
+        name=inp_seq_name,
+        vocabulary=vocabulary,
+        data_id="source",
+        embedding_size=config["embedding_size"])
+
+    encoder = RecurrentEncoder(
+        name=ENCODER_NAME,
+        input_sequence=inp_seq,
+        rnn_size=config["rnn_size"],
+        rnn_cell="NematusGRU")
+
+    encoder_ini = ENCODER_TEMPLATE.format(
+        ENCODER_NAME, config["rnn_size"],
+        inp_seq_name, config["embedding_size"], config["max_length"])
+
+    return encoder, "\n".join([vocabulary_ini, encoder_ini])
+
+
+ATTENTION_TEMPLATE = """\
+[attention]
+class=attention.Attention
+name="{}"
+encoder=<encoder>
+dropout_keep_prob=1.0
+"""
+
+
+def build_attention(config: Dict,
+                    encoder: RecurrentEncoder) -> Tuple[Attention, str]:
+    attention = Attention(
+        name=ATTENTION_NAME,
+        encoder=encoder)
+
+    attention_ini = ATTENTION_TEMPLATE.format(ATTENTION_NAME)
+
+    return attention, attention_ini
+
+
+DECODER_TEMPLATE = """\
+[decoder]
+class=decoders.Decoder
+name="{}"
+vocabulary=<vocabulary_tgt>
+data_id="target"
+embedding_size={}
+rnn_size={}
+max_output_len={}
+encoders=[<encoder>]
+encoder_projection=<nematus_mean>
+attentions=[<attention>]
+attention_on_input=False
+conditional_gru=True
+output_projection=<nematus_nonlinear>
+rnn_cell="NematusGRU"
+dropout_keep_prob=1.0
+
+[nematus_nonlinear]
+class=decoders.output_projection.nematus_output
+output_size={}
+dropout_keep_prob=1.0
+
+[nematus_mean]
+class=decoders.encoder_projection.nematus_projection
+dropout_keep_prob=1.0
+"""
+
+
+def build_decoder(config: Dict,
+                  attention: Attention,
+                  encoder: RecurrentEncoder) -> Tuple[Decoder, str]:
+    vocabulary = from_nematus_json(
+        config["tgt_vocabulary"],
+        max_size=config["n_words_tgt"],
+        pad_to_max_size=True)
+
+    vocabulary_ini = VOCABULARY_TEMPLATE.format(
+        "tgt", config["tgt_vocabulary"], config["n_words_tgt"])
+
+    decoder = Decoder(
+        name=DECODER_NAME,
+        vocabulary=vocabulary,
+        data_id="target",
+        max_output_len=config["max_length"],
+        embedding_size=config["embedding_size"],
+        rnn_size=config["rnn_size"],
+        encoders=[encoder],
+        attentions=[attention],
+        attention_on_input=False,
+        conditional_gru=True,
+        encoder_projection=nematus_projection(dropout_keep_prob=1.0),
+        output_projection=nematus_output(config["embedding_size"]),
+        rnn_cell="NematusGRU")
+
+    decoder_ini = DECODER_TEMPLATE.format(
+        DECODER_NAME, config["embedding_size"], config["rnn_size"],
+        config["max_length"], config["embedding_size"])
+
+    return decoder, "\n".join([vocabulary_ini, decoder_ini])
+
+
+def build_model(config: Dict) -> Tuple[
+        RecurrentEncoder, Attention, Decoder, str]:
+    encoder, encoder_cfg = build_encoder(config)
+    attention, attention_cfg = build_attention(config, encoder)
+    decoder, decoder_cfg = build_decoder(config, attention, encoder)
+
+    ini = "\n".join([encoder_cfg, attention_cfg, decoder_cfg])
+
+    return ini
+
+
+def load_nematus_file(path: str) -> Dict[str, np.ndarray]:
+    contents = np.load(path)
+    cnt_dict = dict(contents)
+    contents.close()
+    return cnt_dict
+
+
+def assign_vars(variables: Dict[str, np.ndarray]) -> List[tf.Tensor]:
+    """For each variable in the map, assign the value from the dict"""
+
+    trainable_vars = tf.trainable_variables()
+    assign_ops = []
+
+    for var in trainable_vars:
+        map_key = var.op.name
+
+        if map_key not in VARIABLE_MAP:
+            raise ValueError("Map key {} not in variable map".format(map_key))
+
+        nem_var_list, fun = VARIABLE_MAP[map_key]
+
+        for nem_var in nem_var_list:
+            if nem_var not in variables:
+                raise ValueError("Alleged nematus var {} not found in loaded "
+                                 "nematus vars.".format(nem_var))
+
+        if fun is None:
+            if len(nem_var_list) != 1:
+                raise ValueError(
+                    "Var list for map key {} must have length 1. "
+                    "Length {} found instead."
+                    .format(map_key, len(nem_var_list)))
+
+            to_assign = variables[nem_var_list[0]]
+        else:
+            to_assign = fun([variables[v] for v in nem_var_list])
+
+        check_shape(var, to_assign)
+        assign_ops.append(tf.assign(var, to_assign))
+
+    return assign_ops
+
+
+INI_HEADER = """\
+; This is an automatically generated configuration file
+; for running imported nematus model
+; For further training, set the configuration as appropriate
+
+[main]
+name="nematus imported translation"
+tf_manager=<tf_manager>
+output="{}"
+runners=[<runner>]
+postprocess=None
+evaluation=[("target", evaluators.bleu.BLEU)]
+runners_batch_size=1
+
+; TODO Set these additional attributes for further training
+; batch_size=80
+; epochs=10
+; train_dataset=<train_data>
+; val_dataset=<val_data>
+; trainer=<trainer>
+; logging_period=20
+; validation_period=60
+; random_seed=1234
+
+; [train_data]
+; class=dataset.load_dataset_from_files
+; s_source="PATH/TO/DATA" ; TODO do not forget to fill this out!
+; s_target="PATH/TO/DATA" ; TODO do not forget to fill this out!
+; lazy=True
+
+; [val_data]
+; class=dataset.load_dataset_from_files
+; s_source="PATH/TO/DATA" ; TODO do not forget to fill this out!
+; s_target="PATH/TO/DATA" ; TODO do not forget to fill this out!
+
+; [trainer]
+; class=trainers.cross_entropy_trainer.CrossEntropyTrainer
+; decoders=[<decoder>]
+; l2_weight=1.0e-8
+; clip_norm=1.0
+
+[tf_manager]
+class=tf_manager.TensorFlowManager
+num_threads=4
+num_sessions=1
+
+[runner]
+class=runners.runner.GreedyRunner
+decoder=<decoder>
+output_series="target"
+"""
+
+
+def write_config(experiment_dir: str, ini: str) -> None:
+    experiment_file = os.path.join(experiment_dir, "experiment.ini")
+    with open(experiment_file, "w", encoding="utf-8") as f_out:
+        f_out.write(INI_HEADER.format(experiment_dir))
+        f_out.write(ini)
+
+
+def prepare_output_dir(output_dir: str) -> bool:
+    if os.path.isdir(output_dir):
+        log("Directory {} already exists. Choose a nonexistent one.".
+            format(output_dir))
+        exit(1)
+
+    os.mkdir(output_dir)
+
+
+def main() -> None:
+    log("Script started.")
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("nematus_json", metavar="NEMATUS-JSON",
+                        help="nematus json file")
+    parser.add_argument("nematus_variables", metavar="NEMATUS-FILE",
+                        help="nematus variable file")
+    parser.add_argument("output_dir", metavar="OUTPUT-DIR",
+                        help="output directory")
+    args = parser.parse_args()
+
+    log("Loading nematus variables from {}.".format(args.nematus_variables))
+    nematus_vars = load_nematus_file(args.nematus_variables)
+
+    log("Loading nematus JSON config from {}.".format(args.nematus_json))
+    nematus_json_cfg = load_nematus_json(args.nematus_json)
+
+    log("Bulding model.")
+    ini = build_model(nematus_json_cfg)
+
+    log("Defining assign Ops.")
+    assign_ops = assign_vars(nematus_vars)
+
+    log("Preparing output directory {}".format(args.output_dir))
+    prepare_output_dir(args.output_dir)
+
+    log("Writing configuration file to {}/experiment.ini."
+        .format(args.output_dir))
+    write_config(args.output_dir, ini)
+
+    log("Creating TF session.")
+    s = tf.Session()
+
+    log("Running session to assign to Neural Monkey variables.")
+    s.run(assign_ops)
+
+    log("Initializing saver.")
+    saver = tf.train.Saver()
+
+    variables_file = os.path.join(args.output_dir, "variables.data")
+    log("Saving variables to {}".format(variables_file))
+    saver.save(s, variables_file)
+
+    log("Finished.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/small.ini b/tests/small.ini
index 7ff4f28bb..512f711f8 100644
--- a/tests/small.ini
+++ b/tests/small.ini
@@ -59,6 +59,7 @@ embedding_size=11
 dropout_keep_prob=0.5
 data_id="source"
 vocabulary=<encoder_vocabulary>
+rnn_cell="NematusGRU"
 
 [attention]
 class=attention.Attention
@@ -81,6 +82,8 @@ dropout_keep_prob=0.5
 data_id="target"
 max_output_len=1
 vocabulary=<decoder_vocabulary>
+attention_on_input=False
+rnn_cell="NematusGRU"
 
 [trainer]
 ; This block just fills the arguments of the trainer __init__ method.