
Add saving predictions
Add support for json files
Add end_tokens for generation

duzx16 committed 3 years ago · commit 2cbe915398
5 changed files with 48 additions and 21 deletions:
  1. configs/model_glm_130b.sh (+1 -1)
  2. evaluation/configs.py (+3 -1)
  3. evaluation/dataset.py (+22 -11)
  4. evaluation/tasks.py (+21 -7)
  5. scripts/evaluate.sh (+1 -1)

configs/model_glm_130b.sh (+1 -1)

@@ -1,5 +1,5 @@
 MODEL_TYPE="glm-130b"
-CHECKPOINT_PATH="/thudm/workspace/hanyu/SwissArmyTransformer/data/ckpt/iter_0049300"
+CHECKPOINT_PATH="/zhangpai21/checkpoints/glm-130b-sat"
 MP_SIZE=8
 MODEL_ARGS="--model-parallel-size ${MP_SIZE} \
             --num-layers 70 \

evaluation/configs.py (+3 -1)

@@ -25,6 +25,7 @@ class BaseConfig(YAMLWizard):
     unidirectional: bool = False  # Whether to use unidirectional attention
     max_seq_length: int = 2048  # Max sequence length
     file_pattern: str | Dict[str, str] = "**/*.json*"  # Organize data file in groups
+    save_prediction: bool = False

     micro_batch_size: int = 1  # 'gen' task only support mbs = 1 for now

@@ -41,13 +42,14 @@ class MultiChoiceTaskConfig(BaseConfig):
 @dataclass
 class GenerationTaskConfig(BaseConfig):
     module = "evaluation.GenerationTask"
-    metrics: List[str] = field(default_factory=lambda: ["EM", "F1"])
+    metrics: List[str] = field(default_factory=lambda: [])
     sampling_strategy: str = "BaseStrategy"
     num_beams: int = 4
     length_penalty: float = 1.0
     no_repeat_ngram_size: int = 3
     min_gen_length: int = 0
     max_gen_length: int = 128
+    end_tokens: List[str] = field(default_factory=lambda: [])

     def __post_init__(self):
         assert self.micro_batch_size == 1, "Only support micro batch size = 1 for generation task"
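
For context, a minimal self-contained sketch (the class below is a stand-in, not the repo's GenerationTaskConfig) of why the new list-valued fields use field(default_factory=...) and how a task might opt in to the two new options:

# Stand-in dataclass mirroring only the fields touched by this commit; the
# real GenerationTaskConfig also carries name/path/strategy fields.
from dataclasses import dataclass, field
from typing import List

@dataclass
class DemoGenerationConfig:
    metrics: List[str] = field(default_factory=list)     # "EM"/"F1" are now opt-in
    end_tokens: List[str] = field(default_factory=list)  # extra stop strings, e.g. ["\n"]
    save_prediction: bool = False                         # dump predictions on rank 0

default_cfg = DemoGenerationConfig()
task_cfg = DemoGenerationConfig(end_tokens=["\n"], save_prediction=True)
default_cfg.metrics.append("EM")
assert task_cfg.metrics == []   # default_factory gives each instance its own list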

evaluation/dataset.py (+22 -11)

@@ -4,6 +4,7 @@ import json
 import numpy as np
 import torch

+from typing import List
 from abc import ABC, abstractmethod
 from scipy.linalg import block_diag

@@ -46,10 +47,16 @@ class EvaluationDataset(torch.utils.data.Dataset, ABC):
         self.gmask_id = tokenizer.get_command("[gMASK]")

         self.data = []
-        with open(os.path.join(path), "r", encoding="utf-8") as file:
-            for line in file:
-                item = json.loads(line)
-                self.data.append(self.process_single_item(item))
+        if path.endswith("jsonl"):
+            with open(os.path.join(path), "r", encoding="utf-8") as file:
+                for line in file:
+                    item = json.loads(line)
+                    self.data.extend(self.process_single_item(item))
+        elif path.endswith("json"):
+            with open(os.path.join(path), "r", encoding="utf-8") as file:
+                dataset = json.load(file)
+            for item in dataset:
+                self.data.extend(self.process_single_item(item))

     @property
     def has_collate_fn(self) -> bool:
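
For reference, an illustrative snippet (not part of the commit) of the two on-disk layouts the loader now distinguishes by extension; file names are placeholders and the record schema is only indicative, since the real keys are whatever get_tokenized_input expects:

import json

records = [{"inputs": "Question: ...", "targets": ["Answer: ..."]}]  # schema is illustrative

# *.jsonl - one JSON object per line, read with json.loads() per line
with open("example.jsonl", "w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")

# *.json - a single JSON array of objects, read once with json.load()
with open("example.json", "w", encoding="utf-8") as f:
    json.dump(records, f)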
@@ -59,7 +66,7 @@ class EvaluationDataset(torch.utils.data.Dataset, ABC):
         return None

     @abstractmethod
-    def process_single_item(self, item) -> dict:
+    def process_single_item(self, item, **kwargs) -> List[dict]:
         pass

     def __len__(self):
@@ -69,12 +76,12 @@ class EvaluationDataset(torch.utils.data.Dataset, ABC):
 class GenerationTaskDataset(EvaluationDataset):
     config: GenerationTaskConfig

-    def process_single_item(self, item):
+    def process_single_item(self, item, **kwargs):
         text, targets = get_tokenized_input(item, "inputs"), get_tokenized_input(item, "targets")
         if len(text) + self.config.max_gen_length + 2 > self.config.max_seq_length:
             text_length = self.config.max_seq_length - self.config.max_gen_length - 2
             text = text[len(text) - text_length : len(text)]
-        return {"text": text, "targets": targets}
+        return [{"text": text, "targets": targets, **kwargs}]

     @staticmethod
     def build_generation_sample(text, max_gen_length, use_task_mask, unidirectional=True):
@@ -124,7 +131,8 @@ class GenerationTaskDataset(EvaluationDataset):
             use_task_mask=self.config.use_task_mask,
             unidirectional=self.config.unidirectional,
         )
-        sample["targets"] = [np.array(target, dtype=self.dtype) for target in item["targets"]]
+        if "targets" in item:
+            sample["targets"] = [np.array(target, dtype=self.dtype) for target in item["targets"]]
         return sample


@@ -165,7 +173,7 @@ class MultiChoiceTaskDataset(EvaluationDataset):
             "is_single_token": self.is_single_token,
             "is_single_token": self.is_single_token,
         }
         }
 
 
-    def process_single_item(self, item):
+    def process_single_item(self, item, **kwargs):
         text, choices, label = get_tokenized_input(item, "inputs"), get_tokenized_input(item, "choices"), item["label"]
         text, choices, label = get_tokenized_input(item, "inputs"), get_tokenized_input(item, "choices"), item["label"]
 
 
         tgt_seq_length = sum([len(choice) for choice in choices])
         tgt_seq_length = sum([len(choice) for choice in choices])
@@ -185,11 +193,12 @@ class MultiChoiceTaskDataset(EvaluationDataset):
         if tgt_seq_length != 1:
             self.is_single_token = False

-        return {
+        return [{
             "text": text,
             "choices": choices,
             "label": label,
-        }
+            **kwargs
+        }]

     @staticmethod
     def build_multiple_choice_sample(text, choices, is_single_token, unified_multitask_encoding=False):
@@ -216,6 +225,8 @@ class MultiChoiceTaskDataset(EvaluationDataset):
         attention_mask = [np.ones((len(token), len(token)), dtype=np.int64)]

         for choice in choices:
+            if not choice:
+                choice = [tokenizer.get_command('eop')]
             position_id = np.concatenate(
                 (
                     position_id,
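
Because process_single_item now returns a list of sample dicts and forwards **kwargs into each of them, one raw record can expand into several evaluation samples and callers can attach extra metadata. A hypothetical subclass sketch (class name and import paths are assumptions, not part of the commit):

from typing import List

# Import paths assume this repo's layout; get_tokenized_input is the helper
# already used by the datasets above.
from evaluation.dataset import GenerationTaskDataset, get_tokenized_input

class MultiReferenceGenerationDataset(GenerationTaskDataset):  # hypothetical
    def process_single_item(self, item, **kwargs) -> List[dict]:
        text = get_tokenized_input(item, "inputs")
        # one sample per reference target; kwargs (e.g. a split tag passed by
        # the caller) are stored alongside each sample
        return [{"text": text, "targets": [target], **kwargs}
                for target in get_tokenized_input(item, "targets")]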

evaluation/tasks.py (+21 -7)

@@ -2,6 +2,7 @@ import torch
 import time
 import numpy as np
 import torch.distributed as dist
+from tqdm import tqdm

 from typing import Dict, Callable, Type, Tuple, List, Any
 from abc import ABC, abstractmethod
@@ -42,6 +43,10 @@ class BaseTask(ABC):

         self.file_groups = self.get_file_groups()
         self.verbose = dist.get_rank() == 0
+        self.save_prediction = config.save_prediction
+
+    def save_prediction_to_file(self, file, prediction, data):
+        pass

     def get_file_groups(self):
         pattern_group = {}
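
The new save_prediction_to_file hook is a no-op in BaseTask; tasks that set save_prediction are expected to override it. A hedged sketch of such an override (the task class, output directory, and record format are invented for illustration):

import json
import os

from evaluation.tasks import GenerationTask  # import path assumed from this repo

class MyGenerationTask(GenerationTask):  # hypothetical task
    def save_prediction_to_file(self, file, prediction, data):
        # Called on rank 0 only, and only when save_prediction is enabled.
        # `prediction` holds the gathered per-sample outputs and `data` is the
        # dataset's raw sample list; both are assumed JSON-serializable here
        # (e.g. plain token id lists).
        out_path = os.path.join("outputs", os.path.basename(file) + ".predictions.jsonl")
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        with open(out_path, "w", encoding="utf-8") as f:
            for pred, item in zip(prediction, data):
                f.write(json.dumps({"prediction": pred, "targets": item.get("targets")}) + "\n")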
@@ -71,7 +76,7 @@

             result_dict_group = {}
             for file in filelist:
-                dataset = self.build_dataset(file)
+                dataset = self.build_dataset(file, group_name)
                 dataloader = build_data_loader(
                     dataset,
                     micro_batch_size=self.config.micro_batch_size,
@@ -81,13 +86,18 @@ class BaseTask(ABC):
                 )

                 prediction = []
+                tqdm_wrapper = tqdm if torch.distributed.get_rank() == 0 else lambda x:x
                 with torch.no_grad():
-                    for _, batch in enumerate(dataloader):
-                        prediction.append(self.predict_single_batch(batch))
+                    for idx, batch in tqdm_wrapper(enumerate(dataloader)):
+                        p_batch = self.predict_single_batch(batch)
+                        prediction.append(p_batch)
+

                 prediction = gather_result(prediction, len(dataset), self.config.micro_batch_size)
                 result_dict = {key: metric(prediction, dataset.data) for key, metric in self.metrics.items()}
                 result_dict_group[file] = (result_dict, len(dataset))
+                if torch.distributed.get_rank() == 0 and self.save_prediction:
+                    self.save_prediction_to_file(file, prediction, dataset.data)

                 if self.verbose:
                     self.report_single_metrics(file, result_dict)
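
The tqdm_wrapper line above keeps the progress bar on rank 0 only; every other rank falls back to a pass-through lambda. A self-contained sketch of the pattern with the distributed rank check replaced by a constant:

from tqdm import tqdm

RANK = 0  # stand-in for torch.distributed.get_rank()
tqdm_wrapper = tqdm if RANK == 0 else (lambda x: x)

results = []
for idx, batch in tqdm_wrapper(enumerate(range(10))):
    results.append(batch)  # placeholder for self.predict_single_batch(batch)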
@@ -152,7 +162,7 @@ class BaseTask(ABC):
         pass

     @abstractmethod
-    def build_dataset(self, relative_path: str) -> EvaluationDataset:
+    def build_dataset(self, relative_path: str, split: str) -> EvaluationDataset:
         pass


@@ -163,13 +173,17 @@ class GenerationTask(BaseTask, ABC):
     def config_class(cls):
         return GenerationTaskConfig

-    def build_dataset(self, relative_path):
+    def build_dataset(self, relative_path, split):
         return GenerationTaskDataset(join(self.config.path, relative_path), self.config)

     def __init__(self, model: ModelForEvaluation, tokenizer: _IceTokenizer, config: GenerationTaskConfig):
         super(GenerationTask, self).__init__(model, tokenizer, config)

         end_tokens = [tokenizer.get_command("eop"), tokenizer.get_command("eos")]
+        if self.config.end_tokens:
+            for token in self.config.end_tokens:
+                end_tokens.append(self.tokenizer.tokenize(token)[-1])
+            print_rank_0(f"End tokens {end_tokens}")
         if self.config.sampling_strategy == "BaseStrategy":
             self.strategy = BaseStrategy(temperature=1.0, top_k=1, end_tokens=end_tokens)
         elif self.config.sampling_strategy == "BeamSearchStrategy":
@@ -180,7 +194,7 @@ class GenerationTask(BaseTask, ABC):
                 end_tokens=end_tokens,
                 no_repeat_ngram_size=self.config.no_repeat_ngram_size,
                 min_gen_length=self.config.min_gen_length,
-                deterministic=True,  # For evaluation, we need a determined generation strategy
+                deterministic=False,  # For evaluation, we need a determined generation strategy
             )
         else:
             raise ValueError(f"unknown strategy {self.config.sampling_strategy}")
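
How a configured stop string ends up in the strategy's stop set: it is tokenized and only the id of its last piece is appended, alongside the built-in eop/eos commands. A runnable sketch with a stub tokenizer (the real task uses the repo's _IceTokenizer):

class StubTokenizer:  # stand-in for _IceTokenizer, with made-up ids
    def get_command(self, name):
        return {"eop": 20002, "eos": 20003}[name]
    def tokenize(self, text):
        return [hash(piece) % 1000 for piece in text.split()] or [0]

tokenizer = StubTokenizer()
config_end_tokens = ["Question:"]  # e.g. end_tokens from the task config

end_tokens = [tokenizer.get_command("eop"), tokenizer.get_command("eos")]
for token in config_end_tokens:
    end_tokens.append(tokenizer.tokenize(token)[-1])
print(end_tokens)  # token ids handed to BaseStrategy / BeamSearchStrategy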
@@ -199,7 +213,7 @@ class MultiChoiceTask(BaseTask, ABC):
     def config_class(cls):
         return MultiChoiceTaskConfig

-    def build_dataset(self, relative_path):
+    def build_dataset(self, relative_path, split):
         return MultiChoiceTaskDataset(join(self.config.path, relative_path), self.config)

     def predict_single_batch(self, batch) -> List[int]:

scripts/evaluate.sh (+1 -1)

@@ -6,7 +6,7 @@ main_dir=$(dirname $script_dir)

 source "${main_dir}/configs/model_glm_130b.sh"

-DATA_PATH="<your evaluation dataset base directory>"
+DATA_PATH="/zhangpai21/workspace/zxdu"

 ARGS="${main_dir}/evaluate.py \
        --mode inference \