
Implement multichoice task for GLM-10B

duzx16 2 years ago
commit 1783d9b6c5
7 changed files with 98 additions and 12 deletions
  1. configs/model_glm_10b.sh (+13 -0)
  2. evaluation/configs.py (+1 -0)
  3. evaluation/dataset.py (+74 -6)
  4. evaluation/tasks.py (+3 -2)
  5. evaluation/utils.py (+2 -2)
  6. initialize.py (+2 -1)
  7. tasks/mmlu/mmlu.yaml (+3 -1)

+ 13 - 0
configs/model_glm_10b.sh

@@ -0,0 +1,13 @@
+MODEL_TYPE="glm-10b"
+CHECKPOINT_PATH="/zhangpai21/checkpoints/glm-10b-sat"
+MP_SIZE=1
+MODEL_ARGS="--model-parallel-size ${MP_SIZE} \
+            --vocab 50304 \
+            --num-layers 48 \
+            --hidden-size 4096 \
+            --num-attention-heads 64 \
+            --max-sequence-length 1025 \
+            --tokenizer-type glm_GPT2BPETokenizer \
+            --tokenizer-model-type gpt2 \
+            --task-mask \
+            --load ${CHECKPOINT_PATH}"
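For reference, a minimal sketch of what these flags amount to once parsed. The argparse stub below is an illustration, not the repository's actual parser; only the flag names and values come from the script above.

    import argparse, shlex

    parser = argparse.ArgumentParser()
    parser.add_argument("--model-parallel-size", type=int)
    parser.add_argument("--vocab", type=int)
    parser.add_argument("--num-layers", type=int)
    parser.add_argument("--hidden-size", type=int)
    parser.add_argument("--num-attention-heads", type=int)
    parser.add_argument("--max-sequence-length", type=int)
    parser.add_argument("--tokenizer-type")
    parser.add_argument("--tokenizer-model-type")
    parser.add_argument("--task-mask", action="store_true")
    parser.add_argument("--load")

    args = parser.parse_args(shlex.split(
        "--model-parallel-size 1 --vocab 50304 --num-layers 48 "
        "--hidden-size 4096 --num-attention-heads 64 --max-sequence-length 1025 "
        "--tokenizer-type glm_GPT2BPETokenizer --tokenizer-model-type gpt2 "
        "--task-mask --load /zhangpai21/checkpoints/glm-10b-sat"
    ))
    assert args.hidden_size % args.num_attention_heads == 0  # 4096 / 64 = 64 dims per head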

+ 1 - 0
evaluation/configs.py

@@ -25,6 +25,7 @@ class BaseConfig(YAMLWizard):
     use_multitask_encoding: bool = False  # Not supported now
     unidirectional: bool = False  # Whether to use unidirectional attention
     max_seq_length: int = 2048  # Max sequence length
+    no_tokenized: bool = False  # Ignore pre-tokenized ids and re-tokenize from "*_pretokenized" text
     file_pattern: str | Dict[str, str] = "**/*.json*"  # Organize data file in groups
 
     micro_batch_size: int = 1  # 'gen' task only support mbs = 1 for now
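BaseConfig subclasses YAMLWizard from the dataclass-wizard package (see the hunk header above), so task YAML files populate these fields directly. A minimal sketch, assuming dataclass-wizard's default lenient key matching maps kebab-case YAML keys like no-tokenized onto snake_case fields; DemoConfig is a stand-in, not the real class:

    from dataclasses import dataclass
    from dataclass_wizard import YAMLWizard

    @dataclass
    class DemoConfig(YAMLWizard):
        no_tokenized: bool = False
        micro_batch_size: int = 1
        max_seq_length: int = 2048

    cfg = DemoConfig.from_yaml("""
    no-tokenized: true
    micro-batch-size: 8
    max-seq-length: 896
    """)
    assert cfg.no_tokenized and cfg.micro_batch_size == 8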

+ 74 - 6
evaluation/dataset.py

@@ -168,14 +168,19 @@ class GenerationTaskDataset(EvaluationDataset):
 
 
 class SmallGenerationTaskDataset(GenerationTaskDataset):
-    config: GenerationTaskConfig
+    def process_single_item(self, item):
+        text, targets = get_tokenized_input(item, "inputs"), get_tokenized_input(item, "targets")
+        if len(text) + self.config.max_gen_length + 3 > self.config.max_seq_length:
+            text_length = self.config.max_seq_length - self.config.max_gen_length - 3
+            text = text[len(text) - text_length : len(text)]
+        return {"text": text, "targets": targets}
 
     @staticmethod
     def build_generation_sample(text, max_gen_length, use_task_mask, unidirectional=True):
         tokenizer = get_tokenizer()
 
         sop_id = tokenizer.get_command("sop")
-        mask_id = tokenizer.get_command("[gMASK]").Id if use_task_mask else tokenizer.get_command("[MASK]").Id
+        mask_id = tokenizer.get_command("[gMASK]") if use_task_mask else tokenizer.get_command("[MASK]")
         cls_id = tokenizer.get_command("ENC")
         eos_id = tokenizer.get_command("eos")
 
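The + 3 here reserves room for three special tokens around the prompt (presumably the cls/mask/sop framing whose ids are fetched above), on top of the max_gen_length tokens to be generated. A toy check of the arithmetic, with made-up numbers:

    max_seq_length, max_gen_length = 1025, 128
    text = list(range(2000))                   # over-long fake prompt ids
    if len(text) + max_gen_length + 3 > max_seq_length:
        text_length = max_seq_length - max_gen_length - 3
        text = text[len(text) - text_length:]  # keep the rightmost tokens
    assert len(text) + max_gen_length + 3 == max_seq_length  # 894 + 128 + 3 = 1025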
@@ -232,6 +237,10 @@ class MultiChoiceTaskDataset(EvaluationDataset):
     def has_collate_fn(self) -> bool:
         return True
 
+    @staticmethod
+    def num_special_tokens():
+        return 2
+
     @staticmethod
     def collate_fn(samples):
         TILE = 32
@@ -263,7 +272,9 @@ class MultiChoiceTaskDataset(EvaluationDataset):
         }
 
     def process_single_item(self, item):
-        text, choices, label = get_tokenized_input(item, "inputs"), get_tokenized_input(item, "choices"), item["label"]
+        text = get_tokenized_input(item, "inputs", no_tokenized=self.config.no_tokenized)
+        choices = get_tokenized_input(item, "choices", no_tokenized=self.config.no_tokenized)
+        label = item["label"]
 
         tgt_seq_length = sum([len(choice) for choice in choices])
         if tgt_seq_length == len(choices):
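The comparison above is a single-token test: if every choice tokenizes to exactly one id, the summed length equals the number of choices. A toy illustration with invented ids:

    choices = [[317], [347], [327], [360]]    # e.g. " A" " B" " C" " D"; ids invented
    tgt_seq_length = sum(len(choice) for choice in choices)
    is_single_token = tgt_seq_length == len(choices)   # True: every choice is 1 token
    if is_single_token:
        tgt_seq_length = 1   # all choices can be scored at one position
    assert tgt_seq_length == 1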
@@ -271,9 +282,9 @@ class MultiChoiceTaskDataset(EvaluationDataset):
             tgt_seq_length = 1
 
         assert tgt_seq_length < self.config.max_seq_length
-        if len(text) + tgt_seq_length + 2 > self.config.max_seq_length:
-            text_length = self.config.max_seq_length - tgt_seq_length - 2
-            text = text[len(text) - text_length : len(text)]
+        if len(text) + tgt_seq_length + self.num_special_tokens() > self.config.max_seq_length:
+            text_length = self.config.max_seq_length - tgt_seq_length - self.num_special_tokens()
+            text = text[len(text) - text_length: len(text)]
 
         assert not (
             self.mask_id in text and self.config.use_multitask_encoding
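Lifting the constant into num_special_tokens() lets the GLM-10B subclass below reserve 3 positions instead of 2: its build_multiple_choice_sample wraps the context with [CLS] and <eos> and appends a [MASK] when one is absent. Toy arithmetic for the prompt budget:

    max_seq_length, tgt_seq_length = 2048, 4
    for num_special_tokens in (2, 3):
        print(num_special_tokens, "->", max_seq_length - tgt_seq_length - num_special_tokens)
    # 2 -> 2042 (base MultiChoiceTaskDataset)
    # 3 -> 2041 (SmallMultiChoiceTaskDataset)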
@@ -354,6 +365,63 @@ class MultiChoiceTaskDataset(EvaluationDataset):
         )
 
 
+class SmallMultiChoiceTaskDataset(MultiChoiceTaskDataset):
+    @staticmethod
+    def num_special_tokens():
+        return 3
+
+    @staticmethod
+    def build_multiple_choice_sample(text, choices, is_single_token, unified_multitask_encoding=False):
+        tokenizer = get_tokenizer()
+        cls_id = tokenizer.get_command("ENC")
+        eos_id = tokenizer.get_command("eos")
+        sop_id = tokenizer.get_command("sop")
+        mask_id = tokenizer.get_command("[MASK]")
+        blank_filling = mask_id in text
+        if not blank_filling:
+            text = text + [mask_id]
+        text = [cls_id] + text + [eos_id]
+
+        token = np.array(text, dtype=np.int64)
+        target = np.array(text, dtype=np.int64)
+        position_id = np.arange(len(text), dtype=np.int64)
+        block_position_id = np.zeros(len(text), dtype=np.int64)
+        mask_position = text.index(mask_id)
+        choice_target_id = []
+
+
+        division = len(token)
+        attention_mask = [np.ones((len(token), len(token)), dtype=np.int64)]
+
+        for choice in choices:
+            position_id = np.concatenate((position_id, [mask_position] * len(choice)))
+            block_position_id = np.concatenate((block_position_id, range(1, 1 + len(choice))))
+            choice_target_id.append(np.arange(len(token), len(token) + len(choice), dtype=np.int64))
+            attention_mask.append(np.tril(np.ones((len(choice), len(choice)), dtype=np.int64)))
+            token = np.concatenate((token, [sop_id], choice[:-1]))
+            target = np.concatenate((target, choice))
+
+            if is_single_token:
+                break
+
+        attention_mask = block_diag(*attention_mask)
+        attention_mask[: len(token), :division] = 1
+
+        if is_single_token:
+            choices = np.array(choices, dtype=np.int64).squeeze().tolist()
+
+        position_id = np.stack((position_id, block_position_id), axis=0)
+
+        item = {
+            "token": token,
+            "position_id": position_id,
+            "attention_mask": attention_mask,
+            "choices": choices,
+            "choice_target_ids": choice_target_id[0] if is_single_token else choice_target_id,
+        }
+        return item
+
+
 class LanguageModelTaskDataset(EvaluationDataset):
     config: LanguageModelTaskConfig
 
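To make the new sample layout concrete: the context attends bidirectionally, each choice attends causally to itself, and every position attends back to the full context. A toy rendering with invented sizes, using scipy's block_diag exactly as the code above does:

    import numpy as np
    from scipy.linalg import block_diag

    context_len, choice_len = 4, 2
    attention_mask = [np.ones((context_len, context_len), dtype=np.int64),
                      np.tril(np.ones((choice_len, choice_len), dtype=np.int64))]
    attention_mask = block_diag(*attention_mask)
    attention_mask[:, :context_len] = 1   # choice tokens see the whole context
    print(attention_mask)
    # [[1 1 1 1 0 0]
    #  [1 1 1 1 0 0]
    #  [1 1 1 1 0 0]
    #  [1 1 1 1 0 0]
    #  [1 1 1 1 1 0]
    #  [1 1 1 1 1 1]]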

+ 3 - 2
evaluation/tasks.py

@@ -14,7 +14,8 @@ from SwissArmyTransformer.tokenization.icetk_glm_130B.ice_tokenizer import _IceT
 from generation import BaseStrategy, BeamSearchStrategy
 from .configs import BaseConfig, GenerationTaskConfig, MultiChoiceTaskConfig, LanguageModelTaskConfig
 from .model import ModelForEvaluation
-from .dataset import EvaluationDataset, GenerationTaskDataset, MultiChoiceTaskDataset, LanguageModelTaskDataset, SmallGenerationTaskDataset
+from .dataset import EvaluationDataset, GenerationTaskDataset, MultiChoiceTaskDataset, LanguageModelTaskDataset
+from .dataset import SmallGenerationTaskDataset, SmallMultiChoiceTaskDataset
 from .utils import build_data_loader, gather_result, print_rank_0
 from .metrics import DEFAULT_METRICS
 
@@ -199,7 +200,7 @@ class MultiChoiceTask(BaseTask, ABC):
         return MultiChoiceTaskConfig
 
     def build_dataset(self, relative_path):
-        return MultiChoiceTaskDataset(join(self.config.path, relative_path), self.config)
+        return SmallMultiChoiceTaskDataset(join(self.config.path, relative_path), self.config)
 
     def predict_single_batch(self, batch) -> List[int]:
         log_probs = self.model.cond_log_prob(batch)
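A sketch of what the surrounding predict_single_batch presumably reduces to: an argmax over per-choice log-probabilities. The return shape of cond_log_prob (one list of floats per sample) is an assumption here:

    import numpy as np

    def predict_single_batch(log_probs_batch):
        # pick the highest-scoring choice index for each sample
        return [int(np.argmax(lp)) for lp in log_probs_batch]

    print(predict_single_batch([[-4.2, -0.7, -3.1, -2.8]]))  # -> [1]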

+ 2 - 2
evaluation/utils.py

@@ -52,8 +52,8 @@ def gather_result(prediction, total_length, micro_batch_size):
     return prediction
 
 
-def get_tokenized_input(item, key):
-    if key in item:
+def get_tokenized_input(item, key, no_tokenized=False):
+    if key in item and not no_tokenized:
         return item[key]
     tokenizer = get_tokenizer()
     pretokenized_key = key + "_pretokenized"
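A self-contained sketch of the behavior after this change; the signature and the first branch are verbatim from the diff, while the stub tokenize argument stands in for the repo's get_tokenizer():

    def get_tokenized_input(item, key, no_tokenized=False, tokenize=str.split):
        if key in item and not no_tokenized:
            return item[key]                          # cached token ids
        return tokenize(item[key + "_pretokenized"])  # re-tokenize raw text

    sample = {"inputs_pretokenized": "The capital of France is"}
    print(get_tokenized_input(sample, "inputs", no_tokenized=True))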

+ 2 - 1
initialize.py

@@ -77,7 +77,8 @@ def initialize_model_and_tokenizer(args):
     # Load checkpoint
     torch.distributed.barrier()
     start = time.time()
-    load_checkpoint(model, args)
+    if args.load:
+        load_checkpoint(model, args)
     torch.distributed.barrier()
     if torch.distributed.get_rank() == 0:
         print(f"> Checkpoint loaded in {time.time() - start:.1f}s")
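A sketch of the guard's effect: an empty or unset --load now skips loading and keeps the randomly initialized weights (presumably handy when no checkpoint path is supplied); load_checkpoint below stands in for the loader used above:

    from argparse import Namespace

    def load_checkpoint(model, args):   # stand-in for the real loader
        print(f"loading from {args.load}")

    args = Namespace(load="")           # --load not supplied
    if args.load:                       # falsy, so loading is a no-op
        load_checkpoint(model=None, args=args)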

+ 3 - 1
tasks/mmlu/mmlu.yaml

@@ -7,4 +7,6 @@ file-pattern:
   social_sciences: "social_sciences/*.json"
   humanities: "humanities/*.json"
   other: "other/*.json"
-micro-batch-size: 1
+no-tokenized: true
+micro-batch-size: 8
+max-seq-length: 896
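A quick consistency check on the new numbers (tying them to the 1025-position window in configs/model_glm_10b.sh is an inference): with MMLU's single-token A/B/C/D choices, the SmallMultiChoiceTaskDataset truncation reserves 1 target token plus 3 special tokens, leaving 892 tokens of prompt.

    max_seq_length, tgt_seq_length, num_special_tokens = 896, 1, 3
    assert max_seq_length <= 1025                 # fits GLM-10B's position table
    print(max_seq_length - tgt_seq_length - num_special_tokens)  # 892 prompt tokens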