task.py
import os
import json
import time

import torch
import torch.distributed as dist

from glob import glob
from dataclasses import dataclass, field
from typing import Union, List
from functools import partial
from tqdm import tqdm
from datetime import datetime

from evaluation.configs import GenerationTaskConfig
from evaluation.model import ModelForEvaluation
from evaluation.tasks import GenerationTask, GenerationTaskDataset
from evaluation.utils import build_data_loader, gather_result, print_rank_0
from SwissArmyTransformer.tokenization.icetk_glm_130B.ice_tokenizer import _IceTokenizer

from .strategy import CodeBaseStrategy
from .utils import LANGUAGE_TAG, cleanup_code
from .metric import HumanEvalEvaluator


@dataclass
class HumanEvalConfig(GenerationTaskConfig):
    module = "tasks.humaneval.task.HumanEvalTask"
    language: str = 'python'
    num_samples: int = 200
    pass_k: List[int] = field(default_factory=lambda: [1, 10, 100])
    max_gen_length: int = 512
    temperature: float = 0.8
    top_k: int = 200
    top_p: float = 0
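
# Illustrative sketch of a task config overriding these fields (the key names
# simply mirror the dataclass fields above; the exact file format and required
# keys depend on how GenerationTaskConfig is loaded in this framework):
#
#   name: humaneval
#   language: python
#   num_samples: 200
#   pass_k: [1, 10, 100]
#   max_gen_length: 512
#   temperature: 0.8
#   top_k: 200
#   top_p: 0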


class HumanEvalDataset(GenerationTaskDataset):
    config: HumanEvalConfig

    @classmethod
    def config_class(cls):
        return HumanEvalConfig

    def __init__(self, path: Union[str, List[str]], model: ModelForEvaluation, config: HumanEvalConfig):
        language = config.language.lower()
        self.language_prefix = ""
        if language in LANGUAGE_TAG:
            # Prepend the language tag for the target language to every prompt.
            self.language_prefix = LANGUAGE_TAG[language] + '\n'
        super().__init__(path, model, config)

    def process_single_item(self, item, **kwargs):
        item["text"] = self.tokenizer.tokenize(self.language_prefix + item["prompt"].lstrip())
        # Duplicate each problem num_samples times so that pass@k can be
        # estimated from multiple independent generations per problem.
        return [item] * self.config.num_samples


class HumanEvalTask(GenerationTask):
    config: HumanEvalConfig

    @classmethod
    def config_class(cls):
        return HumanEvalConfig

    @property
    def metrics(self):
        metric_dict = {}
        for k in self.config.pass_k:
            # Bind the current value of k as a default argument so that each
            # metric closure keeps its own k instead of the loop's final value.
            metric_dict[f'pass@{k}'] = (
                lambda predictions, examples, k=k: self.evaluator.evaluate_pass_k(predictions, examples, k)
            )
        return metric_dict
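
    # For reference, evaluate_pass_k is expected to implement the unbiased
    # pass@k estimator of Chen et al. (2021), "Evaluating Large Language Models
    # Trained on Code": with n generated samples per problem and c of them
    # passing the unit tests,
    #     pass@k = 1 - C(n - c, k) / C(n, k)    (pass@k = 1 when n - c < k)
    # averaged over problems, computed stably as
    #     1 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
    # This describes the standard HumanEval metric; see metric.py for the
    # actual implementation in HumanEvalEvaluator.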

    def build_dataset(self, relative_path):
        return HumanEvalDataset(os.path.join(self.config.path, relative_path), self.model, self.config)

    def __init__(self, model: ModelForEvaluation, tokenizer: _IceTokenizer, config: HumanEvalConfig):
        super(HumanEvalTask, self).__init__(model, tokenizer, config)

        end_tokens = [tokenizer.get_command("eop"), tokenizer.get_command("eos")]
        if self.config.end_tokens:
            for token in self.config.end_tokens:
                end_tokens.append(self.tokenizer.tokenize(token)[-1])

        if self.config.sampling_strategy == "BaseStrategy":
            self.strategy = CodeBaseStrategy(
                language=self.config.language,
                batch_size=self.config.micro_batch_size,
                temperature=self.config.temperature,
                top_k=self.config.top_k,
                top_p=self.config.top_p,
                end_tokens=end_tokens,
            )
        # elif self.config.sampling_strategy == "BeamSearchStrategy":
        #     self.strategy = CodeBeamSearchStrategy(
        #         language=self.config.language,
        #         batch_size=self.config.micro_batch_size,
        #         num_beams=self.config.num_beams,
        #         length_penalty=self.config.length_penalty,
        #         consider_end=True,
        #         end_tokens=end_tokens,
        #         no_repeat_ngram_size=self.config.no_repeat_ngram_size,
        #         min_gen_length=self.config.min_gen_length,
        #         deterministic=True,  # For evaluation, we need a deterministic generation strategy
        #     )
        else:
            raise ValueError(f"unknown strategy {self.config.sampling_strategy}")

        problem_file = glob(os.path.join(self.config.path, self.config.file_pattern))[0]
        self.evaluator = HumanEvalEvaluator(self.config.language, problem_file, self.tokenizer)

    def predict_single_batch(self, batch):
        outputs_batch: List[List[List[int]]] = self.model.generate_text(batch, self.strategy, return_all_beams=False)
        predictions = []
        for output in outputs_batch:
            text = self.tokenizer.tokenizer.decode(output)
            print_rank_0([text])
            # Post-process the decoded text with the language-specific cleanup,
            # then re-encode it so predictions can be gathered as token ids.
            text = cleanup_code(text, self.config.language)
            predictions.append(self.tokenizer.tokenizer.encode(text))
        return predictions

    def evaluate(self):
        dist.barrier()
        start = time.time()

        print_rank_0("\n")
        print_rank_0(f"{self.config}")
        print_rank_0(f"Evaluating task {self.config.name}:")

        result_dict_all = {}

        for group_name, filelist in self.file_groups.items():
            print_rank_0(f"    Evaluating group {group_name}:")

            result_dict_group = {}
            for file in filelist:
                dataset = self.build_dataset(file)
                dataloader = build_data_loader(
                    dataset,
                    micro_batch_size=self.config.micro_batch_size,
                    num_workers=1,
                    drop_last=False,
                    collate_fn=dataset.collate_fn if dataset.has_collate_fn else None,
                )

                prediction = []
                with torch.no_grad():
                    for batch in tqdm(dataloader):
                        prediction.append(self.predict_single_batch(batch))

                prediction = gather_result(prediction, len(dataset), self.config.micro_batch_size)
                result_dict = {key: metric(prediction, dataset.data) for key, metric in self.metrics.items()}
                result_dict_group[file] = (result_dict, len(dataset))

                if dist.get_rank() == 0 and self.save_prediction:
                    self.save_prediction_to_file(file, prediction, dataset.data)

                if self.verbose:
                    self.report_single_metrics(file, result_dict)

            result_dict_all[group_name] = result_dict_group

        print_rank_0(f"Evaluation results of task {self.config.name}:")

        if self.verbose:
            for group_name, result_dict_group in result_dict_all.items():
                self.report_group_metrics(group_name, result_dict_group)
            self.report_overall_metrics(
                {k: v for result_dict_group in result_dict_all.values() for k, v in result_dict_group.items()},
            )

        print_rank_0(f"Finish task {self.config.name} in {time.time() - start:.1f}s.")

    def save_prediction_to_file(self, file, predictions, data):
        file_name = file.split(".")[0]
        out_file = os.path.join(
            "outputs",
            self.config.name + "_" + datetime.now().strftime("%m-%d-%H-%M_") + f"{file_name}.jsonl",
        )
        print(f"Writing results to {out_file}...")

        out_file = os.path.expanduser(out_file)
        os.makedirs(os.path.dirname(out_file), exist_ok=True)

        with open(out_file, 'w') as fp:
            for i, sample in enumerate(tqdm(data)):
                task_id = sample["task_id"]
                # Pop the next execution result recorded for this task_id so each
                # duplicated sample is written alongside its own result.
                result = self.evaluator.results[task_id].pop(0)
                sample["result"] = result[1]["result"]
                sample["passed"] = result[1]["passed"]
                sample["completion"] = self.tokenizer.tokenizer.decode(predictions[i])
                if "text" in sample:
                    sample.pop("text")
                fp.write(json.dumps(sample) + '\n')