@@ -7,7 +7,8 @@ from quantization import quantize
 from SwissArmyTransformer import get_args, get_tokenizer
 from SwissArmyTransformer.arguments import initialize_distributed
 from SwissArmyTransformer.training import load_checkpoint
-from SwissArmyTransformer.model import GLM130B
+from SwissArmyTransformer.model import GLM130B, GLMModel
+from SwissArmyTransformer.model.mixins import CachedAutoregressiveMixin


 def add_bminf_args(parser):
@@ -31,6 +32,7 @@ def initialize(extra_args_provider):
     add_bminf_args(parser)
     add_quantization_args(parser)
     GLM130B.add_model_specific_args(parser)
+    GLMModel.add_model_specific_args(parser)
     extra_args_provider(parser)
     known, args_list = parser.parse_known_args()
     args = get_args(args_list)
@@ -40,11 +42,32 @@ def initialize(extra_args_provider):
     return args


-def initialize_model_and_tokenizer(args):
-    tokenizer = get_tokenizer(args)
+class SmallTokenizer:
+    def __init__(self, tokenizer):
+        self.tokenizer = tokenizer
+
+    def tokenize(self, text):
+        return self.tokenizer.EncodeAsIds(text).tokenization
+
+    def detokenize(self, ids):
+        return self.tokenizer.DecodeIds(ids)
+
+    def get_command(self, name):
+        map = {"[MASK]": "MASK", "[gMASK]": "gMASK", "[sMASK]": "sMASK"}
+        if name in map:
+            name = map[name]
+        return self.tokenizer.get_command(name).Id
+
+def initialize_model_and_tokenizer(args):
+    if args.tokenizer_type.startswith("glm_"):
+        tokenizer = SmallTokenizer(get_tokenizer(args))
+        tokenizer = get_tokenizer(args, outer_tokenizer=tokenizer)
+    else:
+        tokenizer = get_tokenizer(args)

     # Initialize model
-    model = GLM130B(args).half()
+    model = GLMModel(args).half()
+    model.add_mixin('cached-autoregressive', CachedAutoregressiveMixin())

     if args.from_quantized_checkpoint:
         assert args.quantization_bit_width is not None
@@ -77,12 +100,12 @@ def initialize_model_and_tokenizer(args):
     model.eval()

     # generate rotary embedding cache
-    with torch.no_grad():
-        _, *_ = model(
-            torch.ones(1, 1, device=torch.cuda.current_device(), dtype=torch.int64),
-            torch.ones(1, 1, device=torch.cuda.current_device(), dtype=torch.int64) * args.max_sequence_length,
-            torch.ones(1, 1, 1, 1, device=torch.cuda.current_device(), dtype=torch.bool),
-        )
+    # with torch.no_grad():
+    #     _, *_ = model(
+    #         torch.ones(1, 1, device=torch.cuda.current_device(), dtype=torch.int64),
+    #         torch.ones(1, 1, device=torch.cuda.current_device(), dtype=torch.int64) * args.max_sequence_length,
+    #         torch.ones(1, 1, 1, 1, device=torch.cuda.current_device(), dtype=torch.bool),
+    #     )

     torch.distributed.barrier()
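The SmallTokenizer added above is an adapter: it flattens the inner GLM tokenizer's interface (EncodeAsIds(...).tokenization, DecodeIds(...), get_command(...).Id) into the tokenize/detokenize/get_command calls the surrounding code expects. The sketch below exercises that contract with a made-up stand-in tokenizer; _FakeInnerTokenizer and its token ids are illustrative assumptions, not part of the patch, and SmallTokenizer is copied from the hunk above only so the snippet runs on its own.

# SmallTokenizer copied verbatim from the patch so this sketch is self-contained.
class SmallTokenizer:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def tokenize(self, text):
        return self.tokenizer.EncodeAsIds(text).tokenization

    def detokenize(self, ids):
        return self.tokenizer.DecodeIds(ids)

    def get_command(self, name):
        map = {"[MASK]": "MASK", "[gMASK]": "gMASK", "[sMASK]": "sMASK"}
        if name in map:
            name = map[name]
        return self.tokenizer.get_command(name).Id


# Everything below is a hypothetical stand-in for the tokenizer returned by
# SwissArmyTransformer's get_tokenizer(args); names and ids are arbitrary.
class _Encoded:
    def __init__(self, ids):
        self.tokenization = ids  # EncodeAsIds(...) yields an object with .tokenization


class _Command:
    def __init__(self, token_id):
        self.Id = token_id  # get_command(...) yields an object with .Id


class _FakeInnerTokenizer:
    _special = {"MASK": 150000, "gMASK": 150001, "sMASK": 150002}  # made-up ids

    def EncodeAsIds(self, text):
        return _Encoded([ord(c) for c in text])  # toy per-character "encoding"

    def DecodeIds(self, ids):
        return "".join(chr(i) for i in ids)

    def get_command(self, name):
        return _Command(self._special[name])


if __name__ == "__main__":
    tok = SmallTokenizer(_FakeInnerTokenizer())
    ids = tok.tokenize("hello")
    assert tok.detokenize(ids) == "hello"
    assert tok.get_command("[gMASK]") == 150001  # bracketed aliases map to command names
    print("adapter contract OK:", ids)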