
Support GLM's 2D positional encoding

Sengxian, 2 years ago
parent
commit b31b6a9c14
11 changed files with 234 additions and 291 deletions
  1. evaluate.py (+1 -1)
  2. evaluation/dataset.py (+20 -110)
  3. evaluation/model.py (+169 -2)
  4. evaluation/tasks.py (+3 -3)
  5. generate.py (+15 -3)
  6. initialize.py (+7 -1)
  7. server.py (+4 -157)
  8. tasks/cot/task.py (+8 -7)
  9. tasks/ethnic/crows-pair/tasks.py (+3 -3)
  10. tasks/ethnic/stereoset/tasks.py (+3 -3)
  11. tasks/language-modeling/pile.py (+1 -1)

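GLM's 2D positional encoding gives every token two position ids: the first is its position in the masked context (generated tokens all reuse the position of the [MASK]/[gMASK] they expand), and the second is its position inside the generated span (0 for context tokens, 1, 2, … inside the span). When args.position_encoding_2d is set, the sample builders below stack the two sequences into a [2, seq_len] array instead of a single arange. A minimal sketch of the layout (hypothetical helper, mirroring the bidirectional branch of build_multiple_choice_sample further down):

```python
import numpy as np

def glm_2d_position_ids(context_length: int, span_length: int) -> np.ndarray:
    """Sketch of GLM's 2D positions for one context plus one generated span."""
    # First row: positions in the context; the [MASK] is appended right after
    # the context, and every span token reuses its position.
    mask_position = context_length
    position_id = np.concatenate(
        (np.arange(context_length + 1), [mask_position] * span_length)
    )
    # Second row: 0 for context tokens (and the [MASK]), 1..span_length inside the span.
    block_position_id = np.concatenate(
        (np.zeros(context_length + 1, dtype=np.int64), np.arange(1, span_length + 1))
    )
    return np.stack((position_id, block_position_id), axis=0)  # shape [2, seq_len]

# e.g. a 4-token context with a 3-token answer span:
# [[0 1 2 3 4 4 4 4]
#  [0 0 0 0 0 1 2 3]]
print(glm_2d_position_ids(4, 3))
```
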
+ 1 - 1
evaluate.py

@@ -56,7 +56,7 @@ def main():
     print_rank_0(f"> Successfully load {len(task_classes)} task{'s' if len(task_classes) > 1 else ''}")
 
     model, tokenizer = initialize_model_and_tokenizer(args)
-    model = ModelForEvaluation(model)
+    model = ModelForEvaluation(model, args.position_encoding_2d)
 
     start = time.time()
     evaluate_all_tasks(args.data_path, model, tokenizer, args.task, task_classes)

+ 20 - 110
evaluation/dataset.py

@@ -15,17 +15,21 @@ from SwissArmyTransformer import get_tokenizer
 
 from .configs import BaseConfig, MultiChoiceTaskConfig, GenerationTaskConfig, LanguageModelTaskConfig
 from .utils import get_tokenized_input
+from .model import ModelForEvaluation
 
 
 def pad_batch(tokens, position_ids, attention_mask, max_seq_length):
+    pad_length = max_seq_length - len(tokens)
     attention_mask = np.pad(
         attention_mask,
-        pad_width=((0, max_seq_length - len(tokens)),),
+        pad_width=((0, pad_length),),
         mode="constant",
         constant_values=0,
     )
-    tokens = np.concatenate((tokens, np.zeros(max_seq_length - len(tokens), dtype=np.int64)))
-    position_ids = np.concatenate((position_ids, np.zeros(max_seq_length - len(position_ids), dtype=np.int64)))
+    tokens = np.concatenate((tokens, np.zeros(pad_length, dtype=np.int64)))
+    position_ids = np.concatenate(
+        (position_ids, np.zeros_like(position_ids[..., -1:], dtype=np.int64).repeat(pad_length, -1)), axis=-1
+    )
     return tokens, position_ids, attention_mask
 
 
@@ -39,8 +43,9 @@ class EvaluationDataset(torch.utils.data.Dataset, ABC):
     If [MASK] not in context, will append [MASK] after text
     """
 
-    def __init__(self, path: Union[str, List[str]], config: BaseConfig):
+    def __init__(self, path: Union[str, List[str]], model: ModelForEvaluation, config: BaseConfig):
         self.path = path if isinstance(path, list) else [path]
+        self.model = model
         self.config = config
         self.max_seq_length = self.config.max_seq_length
         self.dtype = np.int64
@@ -170,9 +175,9 @@ class GenerationTaskDataset(EvaluationDataset):
 class MultiChoiceTaskDataset(EvaluationDataset):
     config: MultiChoiceTaskConfig
 
-    def __init__(self, path, config: MultiChoiceTaskConfig):
+    def __init__(self, path: Union[str, List[str]], model: ModelForEvaluation, config: BaseConfig):
         self.is_single_token = True  # set to False later in process_single_item func
-        super().__init__(path, config)
+        super().__init__(path, model, config)
 
     @property
     def has_collate_fn(self) -> bool:
@@ -226,88 +231,9 @@ class MultiChoiceTaskDataset(EvaluationDataset):
 
         return [{"text": text, "choices": choices, "label": label, **kwargs}]
 
-    @staticmethod
-    def build_multiple_choice_sample(
-        text,
-        choices,
-        is_single_token,
-        unified_multitask_encoding=False,
-        unidirectional=False,
-        use_task_mask=False,
-    ):
-        tokenizer = get_tokenizer()
-
-        sop_id = tokenizer.get_command("sop")
-        mask_id = tokenizer.get_command("[gMASK]") if use_task_mask else tokenizer.get_command("[MASK]")
-
-        token = np.array(text, dtype=np.int64)
-        target = np.array(text, dtype=np.int64)
-        position_id = np.arange(len(text), dtype=np.int64)
-        choice_target_id = []
-
-        blank_filling = mask_id in text
-        if not blank_filling:
-            if unidirectional:
-                assert use_task_mask
-                token = np.concatenate(([mask_id, sop_id], token[:-1]))
-                target = np.concatenate(([mask_id, sop_id], target[:-1]))
-                position_id = np.arange(len(token), dtype=np.int64)
-                mask_position = len(token)
-            else:
-                mask_position = len(token)
-                token = np.concatenate((token, [mask_id]))
-                target = np.concatenate((target, [mask_id]))
-                position_id = np.concatenate((position_id, [mask_position]))
-        else:
-            assert not unidirectional, "Unidirectional attention doesn't support blank filling"
-            assert not use_task_mask, "Unidirectional attention doesn't support task mask"
-            mask_position = text.index(mask_id)
-
-        division = len(token)
-        attention_mask = [np.ones((len(token), len(token)), dtype=np.int64)]
-        if unidirectional:
-            attention_mask[0] = np.tril(attention_mask[0])
-
-        for choice in choices:
-            if not choice:
-                choice = [tokenizer.get_command("eop")]
-            position_id = np.concatenate(
-                (
-                    position_id,
-                    [mask_position] * len(choice)
-                    if (blank_filling or not unified_multitask_encoding) and not use_task_mask
-                    else np.arange(mask_position, mask_position + len(choice), dtype=np.int64),
-                )
-            )
-            choice_target_id.append(np.arange(len(token), len(token) + len(choice), dtype=np.int64))
-            attention_mask.append(np.tril(np.ones((len(choice), len(choice)), dtype=np.int64)))
-            if unidirectional:
-                token = np.concatenate((token, [text[-1]], choice[:-1]))
-            else:
-                token = np.concatenate((token, [sop_id], choice[:-1]))
-            target = np.concatenate((target, choice))
-
-            if is_single_token:
-                break
-
-        attention_mask = block_diag(*attention_mask)
-        attention_mask[division:, :division] = 1
-
-        if is_single_token:
-            choices = np.array(choices, dtype=np.int64).squeeze().tolist()
-
-        item = {
-            "token": token,
-            "position_id": position_id,
-            "attention_mask": attention_mask,
-            "choices": choices,
-            "choice_target_ids": choice_target_id[0] if is_single_token else choice_target_id,
-        }
-        return item
-
     def __getitem__(self, idx):
         item = self.data[idx]
-        sample = self.build_multiple_choice_sample(
+        sample = self.model.build_multiple_choice_sample(
             item["text"],
             item["choices"],
             is_single_token=self.is_single_token,
@@ -358,27 +284,11 @@ class LanguageModelTaskDataset(EvaluationDataset):
         end_idx = start_idx + self.config.max_seq_length - 1  # for additional [gMASK]
         tokens = self.data[document_idx]["raw_text"][start_idx:end_idx]
 
-        mask_id = self.gmask_id if self.config.use_task_mask else self.mask_id
-        sop_id = self.tokenizer.get_command("sop")
-
-        if idx == 0 or self.config.unidirectional:
-            prompt, text = [], tokens
-        else:
-            prompt_length = self.config.max_seq_length - 1 - self.config.generation_length
-            prompt, text = tokens[:prompt_length], tokens[prompt_length:]
-
-        seq_length = len(prompt) + len(text) + 1
-        attention_mask = np.tril(np.ones((seq_length, seq_length), dtype=np.int64))
-        attention_mask[: len(prompt) + 1, : len(prompt) + 1] = 1
-
-        gen_length = min(len(text), self.config.generation_length)
-        return {
-            "tokens": np.array(prompt + [mask_id, sop_id] + text[:-1], dtype=np.int64),
-            "targets": np.array(prompt + [mask_id] + text, dtype=np.int64),
-            "position_ids": np.arange(0, seq_length, dtype=np.int64),
-            "attention_mask": attention_mask < 0.5,
-            "loss_masks": np.array(
-                [0] * (seq_length - gen_length) + [1] * gen_length,
-                dtype=np.int64,
-            ),
-        }
+        return self.model.build_language_model_sample(
+            tokens,
+            is_first_segment=idx == 0,
+            max_seq_length=self.config.max_seq_length,
+            generation_length=self.config.generation_length,
+            unidirectional=self.config.unidirectional,
+            use_gmask=self.config.use_task_mask,
+        )
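
pad_batch previously assumed 1D position ids; the rewritten version pads only along the last axis, so the same code handles both the classic [seq_len] layout and the stacked [2, seq_len] layout used for 2D positional encoding. A standalone check of the padding expression (sketch, not part of the repo):

```python
import numpy as np

def pad_positions(position_ids: np.ndarray, pad_length: int) -> np.ndarray:
    # Pad with zeros along the last axis only, preserving any leading [2, ...] dim.
    pad = np.zeros_like(position_ids[..., -1:], dtype=np.int64).repeat(pad_length, -1)
    return np.concatenate((position_ids, pad), axis=-1)

print(pad_positions(np.arange(3), 2).shape)  # (5,)   -- 1D position ids
print(pad_positions(np.stack((np.arange(3), np.zeros(3, dtype=np.int64))), 2).shape)  # (2, 5) -- 2D
```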

+ 169 - 2
evaluation/model.py

@@ -1,9 +1,12 @@
+import numpy as np
 import torch
 
 from typing import List, Union
+from scipy.linalg import block_diag
 
 from SwissArmyTransformer.generation.autoregressive_sampling import update_mems, get_masks_and_position_ids_default
 from SwissArmyTransformer.mpu import vocab_parallel_cross_entropy
+from SwissArmyTransformer import get_tokenizer
 
 
 def batch_filling_sequence(
@@ -71,7 +74,9 @@ def batch_filling_sequence(
         if len(tokens.shape) == 3 and num_beams == 1:
             num_beams = tokens.shape[1]
             position_ids = (
-                position_ids.unsqueeze(1).expand(batch_size, num_beams, -1).reshape(batch_size * num_beams, -1)
+                position_ids.unsqueeze(1)
+                .expand((batch_size, num_beams) + position_ids.shape[1:])
+                .reshape((batch_size * num_beams,) + position_ids.shape[1:])
             )
             attention_mask_shape = attention_mask.shape[-3:]
             attention_mask = (
@@ -85,10 +90,11 @@ def batch_filling_sequence(
 
 
 class ModelForEvaluation(torch.nn.Module):
-    def __init__(self, model):
+    def __init__(self, model, position_encoding_2d):
         super().__init__()
 
         self.model = model
+        self.position_encoding_2d = position_encoding_2d
         self.device = next(self.model.parameters()).device
 
     @staticmethod
@@ -99,6 +105,115 @@ class ModelForEvaluation(torch.nn.Module):
             batch["attention_mask"].to(device=device).bool().unsqueeze(1),
         )
 
+    def build_multiple_choice_sample(
+        self,
+        text,
+        choices,
+        is_single_token,
+        unified_multitask_encoding=False,
+        unidirectional=False,
+        use_task_mask=False,
+    ):
+        tokenizer = get_tokenizer()
+
+        sop_id = tokenizer.get_command("sop")
+        mask_id = tokenizer.get_command("[gMASK]") if use_task_mask else tokenizer.get_command("[MASK]")
+
+        token = np.array(text, dtype=np.int64)
+        target = np.array(text, dtype=np.int64)
+        position_id = np.arange(len(text), dtype=np.int64)
+        block_position_id = np.zeros(len(text), dtype=np.int64)
+        choice_target_id = []
+
+        blank_filling = mask_id in text
+        if not blank_filling:
+            if unidirectional:
+                assert use_task_mask, "Unidirectional attention only support gMASK"
+                token = np.concatenate(([mask_id, sop_id], token[:-1]))
+                target = np.concatenate(([mask_id, sop_id], target[:-1]))
+                position_id = np.zeros(len(token), dtype=np.int64)
+                if self.position_encoding_2d:
+                    block_position_id = np.arange(len(token), dtype=np.int64)
+                mask_position = len(token)
+            else:
+                mask_position = len(token)
+                token = np.concatenate((token, [mask_id]))
+                target = np.concatenate((target, [mask_id]))
+                position_id = np.arange(len(token), dtype=np.int64)
+                if self.position_encoding_2d:
+                    block_position_id = np.zeros(len(token), dtype=np.int64)
+        else:
+            assert not unidirectional, "Unidirectional attention doesn't support blank filling"
+            assert not use_task_mask, "Blank filling only support MASK"
+            mask_position = text.index(mask_id)
+
+        division = len(token)
+        attention_mask = [np.ones((len(token), len(token)), dtype=np.int64)]
+        if unidirectional:
+            attention_mask[0] = np.tril(attention_mask[0])
+
+        for choice in choices:
+            if not choice:
+                choice = [tokenizer.get_command("eop")]
+
+            target = np.concatenate((target, choice))
+            choice_target_id.append(np.arange(len(token), len(token) + len(choice), dtype=np.int64))
+            attention_mask.append(np.tril(np.ones((len(choice), len(choice)), dtype=np.int64)))
+
+            if unidirectional:
+                if self.position_encoding_2d:
+                    position_id = np.concatenate((position_id, [0] * len(choice)))
+                    block_position_id = np.concatenate(
+                        (block_position_id, np.arange(mask_position, mask_position + len(choice), dtype=np.int64))
+                    )
+                else:
+                    position_id = np.concatenate(
+                        (
+                            position_id,
+                            np.arange(mask_position, mask_position + len(choice), dtype=np.int64),
+                        )
+                    )
+
+                token = np.concatenate((token, [text[-1]], choice[:-1]))
+            else:
+                if self.position_encoding_2d:
+                    position_id = np.concatenate((position_id, [mask_position] * len(choice)))
+                    block_position_id = np.concatenate(
+                        (block_position_id, np.arange(1, 1 + len(choice), dtype=np.int64))
+                    )
+                else:
+                    position_id = np.concatenate(
+                        (
+                            position_id,
+                            [mask_position] * len(choice)
+                            if (blank_filling or not unified_multitask_encoding) and not use_task_mask
+                            else np.arange(mask_position, mask_position + len(choice), dtype=np.int64),
+                        )
+                    )
+
+                token = np.concatenate((token, [sop_id], choice[:-1]))
+
+            if is_single_token:
+                break
+
+        attention_mask = block_diag(*attention_mask)
+        attention_mask[division:, :division] = 1
+
+        if is_single_token:
+            choices = np.array(choices, dtype=np.int64).squeeze().tolist()
+
+        if self.position_encoding_2d:
+            position_id = np.stack((position_id, block_position_id), axis=0)
+
+        item = {
+            "token": token,
+            "position_id": position_id,
+            "attention_mask": attention_mask,
+            "choices": choices,
+            "choice_target_ids": choice_target_id[0] if is_single_token else choice_target_id,
+        }
+        return item
+
     def cond_log_prob(self, batch) -> List[List[float]]:
         """
         @return: Conditional log probability of each option
@@ -115,6 +230,12 @@ class ModelForEvaluation(torch.nn.Module):
         # output: [b, sq, vocab]
         log_probs = []
 
+        # if torch.distributed.get_rank() == 0:
+        #     import pdb
+        #
+        #     pdb.set_trace()
+        # torch.distributed.barrier()
+
         if is_single_token:  # Single token
             for logits, choices, choice_target_ids in zip(logits_batch, choices_batch, choice_target_ids_batch):
                 log_probs.append(logits[choice_target_ids[0], choices].tolist())
@@ -184,6 +305,52 @@ class ModelForEvaluation(torch.nn.Module):
                 output_targets.append(output_target)
         return output_targets
 
+    def build_language_model_sample(
+        self,
+        tokens: List[int],
+        is_first_segment: bool,
+        max_seq_length: int,
+        generation_length: int,
+        unidirectional: bool,
+        use_gmask: bool,
+    ):
+        tokenizer = get_tokenizer()
+        sop_id = tokenizer.get_command("sop")
+        mask_id = tokenizer.get_command("[gMASK]") if use_gmask else tokenizer.get_command("[MASK]")
+
+        if is_first_segment or unidirectional:
+            prompt, text = [], tokens
+        else:
+            prompt_length = max_seq_length - 1 - generation_length
+            prompt, text = tokens[:prompt_length], tokens[prompt_length:]
+
+        seq_length = len(prompt) + len(text) + 1
+        attention_mask = np.tril(np.ones((seq_length, seq_length), dtype=np.int64))
+        attention_mask[: len(prompt) + 1, : len(prompt) + 1] = 1
+
+        gen_length = min(len(text), generation_length)
+
+        position_id = np.arange(0, seq_length, dtype=np.int64)
+        if self.position_encoding_2d:
+            position_id = np.concatenate(
+                (np.arange(0, seq_length - gen_length, dtype=np.int64), [seq_length - gen_length - 1] * gen_length)
+            )
+            block_position_id = np.concatenate(
+                ([0] * (seq_length - gen_length - 1), np.arange(0, gen_length + 1, dtype=np.int64))
+            )
+            position_id = np.stack((position_id, block_position_id), axis=0)
+
+        return {
+            "tokens": np.array(prompt + [mask_id, sop_id] + text[:-1], dtype=np.int64),
+            "targets": np.array(prompt + [mask_id] + text, dtype=np.int64),
+            "position_ids": position_id,
+            "attention_mask": attention_mask < 0.5,
+            "loss_masks": np.array(
+                [0] * (seq_length - gen_length) + [1] * gen_length,
+                dtype=np.int64,
+            ),
+        }
+
     def calculate_loss(self, batch) -> List[float]:
         tokens, position_ids, attention_mask = self.process_data(batch, self.device)
         targets, loss_masks = (
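
For the language-modeling samples, the 2D layout keeps the first row constant over the scored tail (at the [gMASK] position in the common case) while the second row counts up through it. A hand-worked example with hypothetical sizes, evaluating the expressions above:

```python
import numpy as np

# Hypothetical sizes: a 9-token segment scored with gen_length = 4, so
# seq_length = len(prompt) + len(text) + 1 = 10 (the +1 is the [gMASK] slot).
seq_length, gen_length = 10, 4

position_id = np.concatenate(
    (np.arange(0, seq_length - gen_length), [seq_length - gen_length - 1] * gen_length)
)
block_position_id = np.concatenate(
    ([0] * (seq_length - gen_length - 1), np.arange(0, gen_length + 1))
)
print(np.stack((position_id, block_position_id), axis=0))
# [[0 1 2 3 4 5 5 5 5 5]
#  [0 0 0 0 0 0 1 2 3 4]]
```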

+ 3 - 3
evaluation/tasks.py

@@ -170,7 +170,7 @@ class GenerationTask(BaseTask, ABC):
         return GenerationTaskConfig
 
     def build_dataset(self, relative_path):
-        return GenerationTaskDataset(join(self.config.path, relative_path), self.config)
+        return GenerationTaskDataset(join(self.config.path, relative_path), self.model, self.config)
 
     def save_prediction_to_file(self, file, prediction, data):
         filename = os.path.join("outputs", self.config.name, f"{file}.predict")
@@ -218,7 +218,7 @@ class MultiChoiceTask(BaseTask, ABC):
         return MultiChoiceTaskConfig
 
     def build_dataset(self, relative_path):
-        return MultiChoiceTaskDataset(join(self.config.path, relative_path), self.config)
+        return MultiChoiceTaskDataset(join(self.config.path, relative_path), self.model, self.config)
 
     def predict_single_batch(self, batch) -> List[int]:
         log_probs = self.model.cond_log_prob(batch)
@@ -233,7 +233,7 @@ class LanguageModelTask(BaseTask, ABC):
         return LanguageModelTaskConfig
 
     def build_dataset(self, relative_path):
-        return LanguageModelTaskDataset(join(self.config.path, relative_path), self.config)
+        return LanguageModelTaskDataset(join(self.config.path, relative_path), self.model, self.config)
 
     def predict_single_batch(self, batch) -> List[float]:
         return self.model.calculate_loss(batch)

+ 15 - 3
generate.py

@@ -30,7 +30,7 @@ def isEnglish(s):
         return True
 
 
-def get_masks_and_position_ids(seq, mask_position, max_gen_length, gmask=False):
+def get_masks_and_position_ids(seq, mask_position, max_gen_length, gmask=False, position_encoding_2d=False):
     context_length = seq.shape[1]
     tokens = torch.nn.functional.pad(seq, (0, max_gen_length), mode="constant", value=-1)
     attention_mask = torch.ones((1, tokens.shape[-1], tokens.shape[-1]), device=tokens.device)
@@ -39,9 +39,20 @@ def get_masks_and_position_ids(seq, mask_position, max_gen_length, gmask=False):
     attention_mask.unsqueeze_(1)
     attention_mask = (attention_mask < 0.5).bool()
 
-    position_ids = torch.arange(tokens.shape[-1], dtype=torch.long, device=tokens.device)
-    if not gmask:
+    if position_encoding_2d:
+        position_ids = torch.arange(tokens.shape[-1], dtype=torch.long, device=tokens.device)
         position_ids[context_length - 1 :] = mask_position
+        block_position_ids = torch.cat(
+            (
+                torch.zeros(context_length - 2, dtype=torch.long, device=tokens.device),
+                torch.arange(tokens.shape[-1] - (context_length - 2), dtype=torch.long, device=tokens.device),
+            )
+        )
+        position_ids = torch.vstack((position_ids, block_position_ids))
+    else:
+        position_ids = torch.arange(tokens.shape[-1], dtype=torch.long, device=tokens.device)
+        if not gmask:
+            position_ids[context_length - 1 :] = mask_position
 
     position_ids = position_ids.unsqueeze(0)
 
@@ -115,6 +126,7 @@ def fill_blanks(raw_text: str, model, tokenizer, strategy) -> Tuple[List[str], L
                 mask_position=mask_position,
                 max_gen_length=args.out_seq_length,
                 gmask=use_gmask,
+                position_encoding_2d=args.position_encoding_2d,
             ),
         )
         if isinstance(output, torch.Tensor):  # different strategies
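
At generation time the same layout is built in torch: from the [gMASK] onward the first row is pinned to mask_position, while the second row counts 0, 1, 2, … starting at the [gMASK] (so sop sits at block position 1). A worked example with hypothetical sizes, assuming the [gMASK] and sop are the last two context tokens:

```python
import torch

# Hypothetical input: seq = [t0, t1, t2, [gMASK], sop], i.e. context_length = 5,
# mask_position = 3, generating up to max_gen_length = 3 new tokens.
context_length, mask_position, max_gen_length = 5, 3, 3
total_length = context_length + max_gen_length

position_ids = torch.arange(total_length, dtype=torch.long)
position_ids[context_length - 1:] = mask_position
block_position_ids = torch.cat(
    (
        torch.zeros(context_length - 2, dtype=torch.long),
        torch.arange(total_length - (context_length - 2), dtype=torch.long),
    )
)
print(torch.vstack((position_ids, block_position_ids)))
# tensor([[0, 1, 2, 3, 3, 3, 3, 3],
#         [0, 0, 0, 0, 1, 2, 3, 4]])
```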

+ 7 - 1
initialize.py

@@ -100,7 +100,13 @@ def initialize_model_and_tokenizer(args):
     with torch.no_grad():
         _, *_ = model(
             torch.ones(1, args.max_sequence_length, device=torch.cuda.current_device(), dtype=torch.int64),
-            torch.arange(args.max_sequence_length, device=torch.cuda.current_device(), dtype=torch.int64).view(1, -1),
+            torch.arange(args.max_sequence_length, device=torch.cuda.current_device(), dtype=torch.int64)
+            .view(1, 1, -1)
+            .repeat(1, 2, 1)
+            if args.position_encoding_2d
+            else torch.arange(args.max_sequence_length, device=torch.cuda.current_device(), dtype=torch.int64).view(
+                1, -1
+            ),
             torch.randn(
                 1,
                 1,
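
The warm-up forward pass only needs dummy position ids of the right shape; with 2D encoding enabled that shape becomes [1, 2, max_sequence_length] instead of [1, max_sequence_length]. A quick shape check (sketch, hypothetical length):

```python
import torch

max_sequence_length = 16  # hypothetical value
pos_1d = torch.arange(max_sequence_length, dtype=torch.int64).view(1, -1)
pos_2d = torch.arange(max_sequence_length, dtype=torch.int64).view(1, 1, -1).repeat(1, 2, 1)
print(pos_1d.shape, pos_2d.shape)  # torch.Size([1, 16]) torch.Size([1, 2, 16])
```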

+ 4 - 157
server.py

@@ -1,164 +1,11 @@
-import os
+import time
 import torch
-import stat
-import re
-
-from functools import partial
-from typing import List, Tuple
-
-from SwissArmyTransformer import mpu
-from evaluation.model import batch_filling_sequence
-from generation import BeamSearchStrategy, BaseStrategy
-from SwissArmyTransformer.generation.utils import timed_name
-from initialize import initialize, initialize_model_and_tokenizer
-
 import torch.distributed as dist
-import time
-
 import gradio as gr
 
-
-def add_generation_specific_args(parser):
-    parser.add_argument("--sampling-strategy", type=str, default="BaseStrategy", help="Type of sampling strategy.")
-    parser.add_argument("--min-gen-length", type=int, default=0, help="The minimum length each blank should generate.")
-    parser.add_argument(
-        "--print-all-beams", action="store_true", help="Print all output generated by beam search strategy."
-    )
-
-
-def isEnglish(s):
-    try:
-        s.encode(encoding="utf-8").decode("ascii")
-    except UnicodeDecodeError:
-        return False
-    else:
-        return True
-
-
-def get_masks_and_position_ids(seq, mask_position, max_gen_length, gmask=False):
-    context_length = seq.shape[1]
-    tokens = torch.nn.functional.pad(seq, (0, max_gen_length), mode="constant", value=-1)
-    attention_mask = torch.ones((1, tokens.shape[-1], tokens.shape[-1]), device=tokens.device)
-    attention_mask.tril_()
-    attention_mask[..., : context_length - 1] = 1
-    attention_mask.unsqueeze_(1)
-    attention_mask = (attention_mask < 0.5).bool()
-
-    position_ids = torch.arange(tokens.shape[-1], dtype=torch.long, device=tokens.device)
-    if not gmask:
-        position_ids[context_length - 1 :] = mask_position
-
-    position_ids = position_ids.unsqueeze(0)
-
-    return tokens, attention_mask, position_ids
-
-
-def fill_blanks(raw_text: str, model, tokenizer, strategy) -> Tuple[List[str], List[str], List[List[str]]]:
-    # add MASK
-    generation_mask = "[gMASK]"
-    if "[MASK]" in raw_text:
-        generation_mask = "[MASK]"
-    elif "[sMASK]" in raw_text:
-        generation_mask = "[sMASK]"
-    use_gmask = "[MASK]" not in raw_text and "[sMASK]" not in raw_text
-
-    mask_pattern = r"\[[sg]?MASK\]"
-    text_list = re.split(mask_pattern, raw_text)
-    pattern_list = re.compile(mask_pattern).findall(raw_text)
-    seq = []
-    for i in range(len(pattern_list)):
-        pattern = pattern_list[i]
-        sub_text = text_list[i]
-        seq.extend(tokenizer.tokenize(sub_text))
-        seq.append(tokenizer.get_command(pattern))
-
-    seq.extend(tokenizer.tokenize(text_list[-1]))
-
-    if "MASK]" not in raw_text:
-        seq += [tokenizer.get_command(generation_mask)]
-        raw_text += " " + generation_mask
-    if not raw_text.endswith("MASK]"):
-        seq = seq + [tokenizer.get_command("eos")]
-    if mpu.get_model_parallel_rank() == 0:
-        print("\nInput: {}\n".format(raw_text))
-    if len(seq) > args.max_sequence_length:
-        raise ValueError("text too long.")
-
-    # generation
-    is_english = isEnglish(raw_text)
-    output_list = [seq]
-    num_output = args.num_beams if args.sampling_strategy == "BeamSearchStrategy" else 1
-    last_pos, answers, answers_with_style, blanks = (
-        [0] * num_output,
-        ["" for _ in range(num_output)],
-        ["" for _ in range(num_output)],
-        [[] for _ in range(num_output)],
-    )
-
-    # continually detect the first mark position
-    while True:
-        seq = output_list[0]
-        # detect mask position
-        mask_token = tokenizer.get_command(generation_mask)
-        if mask_token not in seq:
-            break
-        mask_position = seq.index(mask_token)
-
-        output_list = []
-
-        input_seq = torch.cuda.LongTensor(
-            [seq + [tokenizer.get_command("sop")]],
-            device=args.device,
-        )
-        output, _ = batch_filling_sequence(
-            model,
-            input_seq,
-            torch.cuda.LongTensor([input_seq.shape[-1]], device=args.device),
-            strategy=strategy,
-            get_masks_and_position_ids=partial(
-                get_masks_and_position_ids,
-                mask_position=mask_position,
-                max_gen_length=args.out_seq_length,
-                gmask=use_gmask,
-            ),
-        )
-        if isinstance(output, torch.Tensor):  # different strategies
-            output = output.tolist()
-        output = output[0]  # batch_size = 1
-        output_list.extend(output)
-
-        # clip -1s and fill back generated things into seq
-        for i in range(len(output_list)):
-            output = output_list[i].tolist() if isinstance(output_list[i], torch.Tensor) else output_list[i]
-            try:
-                unfinished = output.index(-1)
-            except ValueError:
-                unfinished = len(output)
-            if output[unfinished - 1] in strategy.end_tokens:
-                unfinished -= 1
-            bog = output.index(tokenizer.get_command("sop"))
-
-            prefix = tokenizer.detokenize(output[last_pos[i] : mask_position])
-            blank = tokenizer.detokenize(output[bog + 1 : unfinished])
-            answers_with_style[i] += (
-                prefix
-                + (" " if is_english else "")
-                + ("\033[4m" if use_gmask else "\x1b[0;32m\033[4m")
-                + blank
-                + ("\033[0m" if use_gmask else "\033[0m\x1b[0m")
-                + (" " if is_english else "")
-            )
-            blanks[i].append(blank)
-            last_pos[i] = mask_position + unfinished - (bog + 1)
-            output_list[i] = output[:mask_position] + output[bog + 1 : unfinished] + output[mask_position + 1 : bog]
-
-    for i, output in enumerate(output_list):
-        if output[-1] == tokenizer.get_command("eos"):
-            output = output[:-1]
-        answers_with_style[i] += tokenizer.detokenize(output[last_pos[i] :])
-        answers[i] = tokenizer.detokenize(output)
-
-    return answers, answers_with_style, blanks
+from generation import BeamSearchStrategy, BaseStrategy
+from initialize import initialize, initialize_model_and_tokenizer
+from generate import add_generation_specific_args, fill_blanks
 
 
 def generate_continually(func, raw_text):

+ 8 - 7
tasks/cot/task.py

@@ -3,6 +3,7 @@ import json
 import re
 from typing import Union, List, Dict, Callable
 from datetime import datetime
+from evaluation.model import ModelForEvaluation
 from evaluation.tasks import GenerationTask, GenerationTaskDataset, GenerationTaskConfig
 from evaluation.utils import print_rank_0
 from dataclasses import dataclass
@@ -116,14 +117,14 @@ def extract_answer(prediction, task_name, chain_of_thought=True):
 class ChainOfThoughtDataset(GenerationTaskDataset):
     config: ChainOfThoughtConfig
 
-    def __init__(self, path: Union[str, List[str]], config: ChainOfThoughtConfig):
+    def __init__(self, path: Union[str, List[str]], model: ModelForEvaluation, config: ChainOfThoughtConfig):
         self.labeled_examples = read_examples(config.prompt_path)
         self.labeled_prompt = build_prompt(
             self.labeled_examples, config.name, chain_of_thought=config.chain_of_thought, prompt_type=config.prompt_type
         )
         # print_rank_0(self.labeled_prompt)
         self.printed_count = 0
-        super().__init__(path, config)
+        super().__init__(path, model, config)
         # print_rank_0(len(self.tokenizer.tokenize(self.labeled_prompt)))
 
     def process_single_item(self, item, **kwargs):
@@ -209,15 +210,15 @@ class ChainOfThoughtTask(GenerationTask):
 
     def build_dataset(self, relative_path):
         if self.config.name.startswith("gsm8k"):
-            return GSM8KDataset(os.path.join(self.config.path, relative_path), self.config)
+            return GSM8KDataset(os.path.join(self.config.path, relative_path), self.model, self.config)
         elif self.config.name.startswith("sports"):
-            return SportsDataset(os.path.join(self.config.path, relative_path), self.config)
+            return SportsDataset(os.path.join(self.config.path, relative_path), self.model, self.config)
         elif self.config.name.startswith("lastletter"):
-            return LastLetterDataset(os.path.join(self.config.path, relative_path), self.config)
+            return LastLetterDataset(os.path.join(self.config.path, relative_path), self.model, self.config)
         elif self.config.name.startswith("coinflip") or self.config.name.startswith("reverse"):
-            return ChainOfThoughtDataset(os.path.join(self.config.path, relative_path), self.config)
+            return ChainOfThoughtDataset(os.path.join(self.config.path, relative_path), self.model, self.config)
         elif self.config.name.startswith("date"):
-            return DateDataset(os.path.join(self.config.path, relative_path), self.config)
+            return DateDataset(os.path.join(self.config.path, relative_path), self.model, self.config)
         else:
             raise NotImplementedError
 

+ 3 - 3
tasks/ethnic/crows-pair/tasks.py

@@ -19,7 +19,7 @@ class CrowsPairTask(MultiChoiceTask, ABC):
     config: MultiChoiceTaskConfig
 
     def build_dataset(self, relative_path):
-        return CrowsPairDataset(join(self.config.path, relative_path), self.config)
+        return CrowsPairDataset(join(self.config.path, relative_path), self.model, self.config)
 
     def predict_single_batch(self, batch) -> List[int]:
         log_probs = self.model.cond_log_prob(batch)
@@ -69,10 +69,10 @@ class CrowsPairDataset(MultiChoiceTaskDataset):
 
     config: MultiChoiceTaskConfig
 
-    def __init__(self, path, config: MultiChoiceTaskConfig):
+    def __init__(self, path, model, config: MultiChoiceTaskConfig):
         self.is_single_token = True  # set to False later in process_single_item func
         self.eval_data = []
-        super().__init__(path, config)
+        super().__init__(path, model, config)
 
     def process_single_item(self, item):
         text, choices, label = (

+ 3 - 3
tasks/ethnic/stereoset/tasks.py

@@ -20,7 +20,7 @@ class StereoSetTask(MultiChoiceTask, ABC):
     config: MultiChoiceTaskConfig
 
     def build_dataset(self, relative_path):
-        return StereoSetDataset(join(self.config.path, relative_path), self.config)
+        return StereoSetDataset(join(self.config.path, relative_path), self.model, self.config)
 
     def predict_single_batch(self, batch) -> List[int]:
         log_probs = self.model.cond_log_prob(batch)
@@ -84,10 +84,10 @@ class StereoSetTask(MultiChoiceTask, ABC):
 class StereoSetDataset(MultiChoiceTaskDataset):
     config: MultiChoiceTaskConfig
 
-    def __init__(self, path, config: MultiChoiceTaskConfig):
+    def __init__(self, path, model, config: MultiChoiceTaskConfig):
         self.is_single_token = True  # set to False later in process_single_item func
         self.eval_data = []
-        super().__init__(path, config)
+        super().__init__(path, model, config)
 
     def process_single_item(self, item):
         text, choices, label = (

+ 1 - 1
tasks/language-modeling/pile.py

@@ -33,7 +33,7 @@ class Pile(LanguageModelTask):
         return {"BPB": calculate_bpb_score}
 
     def build_dataset(self, relative_path):
-        return PileDataset(join(self.config.path, relative_path), self.config)
+        return PileDataset(join(self.config.path, relative_path), self.model, self.config)
 
     def report_single_metrics(self, file: str, result_dict: Dict[str, float]):
         pass