From fa264780e28308c5f3b983dfc36fe61401bd8799 Mon Sep 17 00:00:00 2001 From: Max Date: Fri, 29 Jun 2018 08:10:49 -0500 Subject: [PATCH 1/3] Allow multi-GPU training (removes inadvertent restrictions in the code) Fix: restore compatibility with single-GPU configs Remove extraneous code (model is reassigned to CUDA device in __init__) Restore GPU device check in the command shell --- allennlp/commands/train.py | 7 ++++++- allennlp/training/trainer.py | 4 +--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index dd67f907404..13a18af9a06 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -253,7 +253,12 @@ def train_model(params: Params, create_serialization_dir(params, serialization_dir, recover) prepare_global_logging(serialization_dir, file_friendly_logging) - check_for_gpu(params.params.get('trainer').get('cuda_device', -1)) + cuda_device = params.params.get('trainer').get('cuda_device', -1) + if isinstance(cuda_device, list): + for device in cuda_device: + check_for_gpu(device) + else: + check_for_gpu(cuda_device) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file: diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index 8dcbd7e3d91..2a9dbd84acd 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -926,13 +926,11 @@ def from_params(cls, patience = params.pop_int("patience", None) validation_metric = params.pop("validation_metric", "-loss") num_epochs = params.pop_int("num_epochs", 20) - cuda_device = params.pop_int("cuda_device", -1) + cuda_device = params.pop("cuda_device", -1) grad_norm = params.pop_float("grad_norm", None) grad_clipping = params.pop_float("grad_clipping", None) lr_scheduler_params = params.pop("learning_rate_scheduler", None) - if cuda_device >= 0: - model = model.cuda(cuda_device) parameters = [[n, p] for n, p in 
model.named_parameters() if p.requires_grad] optimizer = Optimizer.from_params(parameters, params.pop("optimizer")) From 82b0d8daee2b62345d824d644d06ce2f29027498 Mon Sep 17 00:00:00 2001 From: Max Date: Fri, 29 Jun 2018 11:45:06 -0500 Subject: [PATCH 2/3] Chunk metadata batches when training multiple GPUs (#1439) --- allennlp/training/trainer.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index 2a9dbd84acd..7d5ff1194e0 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -378,7 +378,20 @@ def _data_parallel(self, batch): of torch.nn.parallel.data_parallel to support the allennlp model interface. """ + metadata_batch_size = len(batch['metadata']) if 'metadata' in batch and isinstance(batch['metadata'],list) else None + inputs, module_kwargs = scatter_kwargs((), batch, self._cuda_devices, 0) + + if metadata_batch_size is not None: + # Metadata batches also have to be chunked as PyTorch is unaware of them. + # Follows chunking implementation by ATen.native.TensorShape functions. + chunk_size = 1 + (metadata_batch_size - 1)//len(self._cuda_devices) + chunk_offset = 0 + for instance in module_kwargs: + if 'metadata' in instance: + instance['metadata'] = instance['metadata'][chunk_offset:chunk_size+chunk_offset] + chunk_offset += chunk_size + used_device_ids = self._cuda_devices[:len(inputs)] replicas = replicate(self._model, used_device_ids) outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids) From d3cadd936a132d6f0e75adee6a163c54058de546 Mon Sep 17 00:00:00 2001 From: "paul.murphy" Date: Mon, 2 Jul 2018 12:02:58 +0100 Subject: [PATCH 3/3] multi para predictor. 
--- .../reading_comprehension/triviaqa.py | 2 +- allennlp/predictors/__init__.py | 1 + allennlp/predictors/multi_para.py | 42 +++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 allennlp/predictors/multi_para.py diff --git a/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py b/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py index e8335c6ae52..c37682b4117 100644 --- a/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py +++ b/allennlp/data/dataset_readers/reading_comprehension/triviaqa.py @@ -499,7 +499,7 @@ def text_to_instance(self, # type: ignore paragraph_tokens = [[truncate_token(token, self._max_token_length) for token in tokens] for tokens in paragraph_tokens] - if token_spans is None: + if token_spans is None and answer_texts is not None: token_spans = [util.find_valid_answer_spans(paragraph_tokens_i, answer_texts) for paragraph_tokens_i in paragraph_tokens] if question_tokens is None: diff --git a/allennlp/predictors/__init__.py b/allennlp/predictors/__init__.py index 0be6fc4ee6e..598dd914678 100644 --- a/allennlp/predictors/__init__.py +++ b/allennlp/predictors/__init__.py @@ -16,3 +16,4 @@ from allennlp.predictors.simple_seq2seq import SimpleSeq2SeqPredictor from allennlp.predictors.wikitables_parser import WikiTablesParserPredictor from allennlp.predictors.nlvr_parser import NlvrParserPredictor +from allennlp.predictors.multi_para import MultiParaPredictor diff --git a/allennlp/predictors/multi_para.py b/allennlp/predictors/multi_para.py new file mode 100644 index 00000000000..2a9422506e7 --- /dev/null +++ b/allennlp/predictors/multi_para.py @@ -0,0 +1,42 @@ +from typing import Tuple +from overrides import overrides + +from allennlp.common.util import JsonDict +from allennlp.data import Instance +from allennlp.predictors.predictor import Predictor + + +@Predictor.register('multi-para') +class MultiParaPredictor(Predictor): + """ + Predictor for the 
:class:`~allennlp.models.bidaf.BidirectionalAttentionFlow` model. + """ + + def predict(self, question: str, passage: str) -> JsonDict: + """ + Make a machine comprehension prediction on the supplied input. + See https://rajpurkar.github.io/SQuAD-explorer/ for more information about the machine comprehension task. + + Parameters + ---------- + question : ``str`` + A question about the content in the supplied paragraph. The question must be answerable by a + span in the paragraph. + passage : ``str`` + A paragraph of information relevant to the question. + + Returns + ------- + A dictionary that represents the prediction made by the system. The answer string will be under the + "best_span_str" key. + """ + return self.predict_json({"passages": passage, "question": question}) + + @overrides + def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]: + """ + Expects JSON that looks like ``{"question": "...", "passages": "..."}``. + """ + question_text = json_dict["question"] + passage_texts = json_dict["passages"] + return self._dataset_reader.text_to_instance(question_text, passage_texts), {}