
fix StandardTransformerEncoder

Guillaume Wenzek 1 year ago
parent
commit
86993cbd00

+ 5 - 3
ggml/examples/unity/fairseq2.cpp

@@ -502,7 +502,7 @@ int _determine_max_seq_len(const SequenceGeneratorJob& job, int source_seq_len)
     if (source_seq_len <= 0 || opts.soft_max_seq_len_a <= 0) {
         max_seq_len = opts.hard_max_seq_len;
     } else {
-        max_seq_len = std::min(opts.hard_max_seq_len, int(opts.soft_max_seq_len_a * source_seq_len + opts.soft_max_seq_len_b));
+        max_seq_len = std::min(opts.hard_max_seq_len, int(opts.soft_max_seq_len_a * source_seq_len) + opts.soft_max_seq_len_b);
     }
 
     if (opts.min_seq_len > max_seq_len) {
@@ -839,14 +839,16 @@ extern "C" float generate_sequence(
             encoder_output,
             encoder_padding_mask
             // state_bag=state_bag,
-        );
+        ); // (B, S, D)
 
         // state_bag.increment_step()
 
         // Because of no IncrementalStateBag decoder_output here is of shape (B, S, D)
         // Just look at the last token.
         decoder_output = ggml_slice(ctx, decoder_output, 1, step_nr, step_nr+1);
-        ggml_tensor* logits = Linear_forward(model, "final_proj", decoder_output);
+        decoder_output = ggml_cont(ctx, decoder_output);
+        decoder_output = ggml_flatten_1d(ctx, decoder_output, 0);  // (B, model_dim)
+        ggml_tensor* logits = Linear_forward(model, "final_proj", decoder_output);  // (B, vocab_size)
         ggml_tensor* lprobs = ggml_log_softmax(ctx, logits);
 
         // Compute lprobs here so we can modify it in place in the lprob tweaking phase

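The second hunk in generate_sequence() reshapes the decoder output before the final projection: after slicing out the last generated step, the (B, 1, D) view is made contiguous and flattened to (B, model_dim), so Linear_forward produces logits of shape (B, vocab_size). A minimal NumPy sketch of the intended shape flow (an illustration only, not the ggml call sequence; the projection weight is a stand-in):

    import numpy as np

    B, S, D, V = 2, 7, 1024, 256   # beams, steps so far, model_dim, vocab_size (illustrative sizes)
    decoder_output = np.random.rand(B, S, D).astype(np.float32)
    step_nr = S - 1

    last = decoder_output[:, step_nr:step_nr + 1, :]   # like ggml_slice -> (B, 1, D)
    last = np.ascontiguousarray(last)                  # like ggml_cont
    last = last.reshape(B, D)                          # like ggml_flatten_1d -> (B, model_dim)

    W = np.random.rand(V, D).astype(np.float32)        # stand-in for the final_proj weight
    logits = last @ W.T                                # (B, vocab_size)
    assert logits.shape == (B, V)
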
+ 1 - 1
ggml/examples/unity/fairseq2.h

@@ -114,7 +114,7 @@ struct SequenceGeneratorOptions {
     /// sequence length. The generated sequences (including prefix sequence) will
     /// have the maximum length of ``min(hard_max_seq_len, ax + b)``. See also
     /// ``hard_max_seq_len``.
-    int soft_max_seq_len_a = 1;
+    float soft_max_seq_len_a = 1;
     int soft_max_seq_len_b = 200;
 
     /// The hard limit on maximum length of generated sequences.

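Together with the .cpp change above, making soft_max_seq_len_a a float means the documented limit min(hard_max_seq_len, ax + b) can use a fractional coefficient, and only the a * source_seq_len product is truncated before b is added. A small sketch of that arithmetic (the helper name and the hard limit used below are illustrative, not part of the code):

    def max_seq_len(source_seq_len: int, a: float, b: int, hard: int) -> int:
        # Mirrors the updated C++ expression: truncate a * source_seq_len, then add b.
        if source_seq_len <= 0 or a <= 0:
            return hard
        return min(hard, int(a * source_seq_len) + b)

    assert max_seq_len(50, a=1.5, b=200, hard=1024) == 275    # int(75.0) + 200
    assert max_seq_len(50, a=0.5, b=200, hard=1024) == 225    # int(25.0) + 200
    assert max_seq_len(900, a=1.0, b=200, hard=1024) == 1024  # capped by the hard limit
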
+ 1 - 1
ggml/ggml.py

@@ -406,7 +406,7 @@ def ggml_unflatten_1d(
 class SequenceGeneratorOptions:
     beam_size: int
     min_seq_len: int
-    soft_max_seq_len_a: int
+    soft_max_seq_len_a: float
     soft_max_seq_len_b: int
     hard_max_seq_len: int
     len_penalty: float

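The Python-side SequenceGeneratorOptions mirrors the C++ struct, so soft_max_seq_len_a has to change type together with the header: a mismatched declaration across the FFI boundary would reinterpret the float bits as an integer. A generic ctypes sketch of why the types must stay in sync (hypothetical struct name, for illustration only; not the binding mechanism ggml.py actually uses):

    import ctypes

    class _GeneratorOpts(ctypes.Structure):          # hypothetical mirror of the C++ layout
        _fields_ = [
            ("soft_max_seq_len_a", ctypes.c_float),  # must match the C++ `float`
            ("soft_max_seq_len_b", ctypes.c_int),
        ]

    opts = _GeneratorOpts(soft_max_seq_len_a=1.0, soft_max_seq_len_b=200)
    assert abs(opts.soft_max_seq_len_a - 1.0) < 1e-6

    # Reading the same bytes with the wrong type shows what a stale `int` field would see:
    wrong = ctypes.c_int.from_buffer_copy(bytes(ctypes.c_float(1.0))).value
    assert wrong == 1065353216   # bit pattern of 1.0f misread as an integer
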
+ 10 - 10
ggml/test_unity_cpp.py

@@ -224,16 +224,16 @@ def test_MultiheadAttention_forward(ctx: Ctx, g_model: c_void_p, pt_model: Any)
 def test_StandardTransformerEncoderLayer_forward(
     ctx: Ctx, g_model: c_void_p, pt_model: Any
 ) -> None:
-    x = torch.empty((1, 21, 1024))
-    padding_mask = torch.ones((1, 21))
+    x = torch.empty((2, 21, 1024))
+    padding_mask = torch.ones((2, 21))
     torch.random.manual_seed(0)
     torch.nn.init.uniform_(x, -1, 1)
 
     layer = pt_model.text_encoder.layers[0]
 
-    gx = ggml.from_numpy(ctx, x[0])
+    gx = ggml.from_numpy(ctx, x)
     ggml.ggml_set_name(gx, b"x")
-    gpad = ggml.from_numpy(ctx, padding_mask[0])
+    gpad = ggml.from_numpy(ctx, padding_mask)
     ggml.ggml_set_name(gpad, b"padding_mask")
     gy = ggml.forward(
         "StandardTransformerEncoderLayer",
@@ -248,7 +248,7 @@ def test_StandardTransformerEncoderLayer_forward(
     y = ggml.to_numpy(gy)
 
     y_exp, _ = layer(x, padding_mask)
-    y_exp = y_exp.squeeze(0).numpy()  # remove batch dimension
+    y_exp = y_exp.numpy()
 
     assert y.shape == y_exp.shape
     assert np.allclose(y_exp, y, atol=1e-4 if UNITY_FLASH_ATTN else 1e-2)
@@ -257,14 +257,14 @@ def test_StandardTransformerEncoderLayer_forward(
 def test_StandardTransformerEncoder_forward(
     ctx: Ctx, g_model: c_void_p, pt_model: Any
 ) -> None:
-    x = torch.empty((1, 21, 1024))
-    padding_mask = torch.ones((1, 21))
+    x = torch.empty((2, 21, 1024))
+    padding_mask = torch.ones((2, 21))
     torch.random.manual_seed(0)
     torch.nn.init.uniform_(x, -1, 1)
 
-    gx = ggml.from_numpy(ctx, x[0])
+    gx = ggml.from_numpy(ctx, x)
     ggml.ggml_set_name(gx, b"x")
-    gpad = ggml.from_numpy(ctx, padding_mask[0])
+    gpad = ggml.from_numpy(ctx, padding_mask)
     ggml.ggml_set_name(gpad, b"padding_mask")
     gy = ggml.forward(
         "StandardTransformerEncoder",
@@ -279,7 +279,7 @@ def test_StandardTransformerEncoder_forward(
     y = ggml.to_numpy(gy)
 
     y_exp, _ = pt_model.text_encoder(x, padding_mask)
-    y_exp = y_exp.squeeze(0).numpy()  # remove batch dimension
+    y_exp = y_exp.numpy()
 
     assert y.shape == y_exp.shape
     assert np.allclose(y_exp, y, atol=1e-4 if UNITY_FLASH_ATTN else 1e-2)
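
Both encoder tests now keep the batch dimension (B = 2) on the ggml side instead of squeezing it away, and compare against the unsqueezed fairseq2 output, so the graph is exercised with a genuine batch rather than a single squeezed sequence. A rough sketch of the shape bookkeeping involved (assuming the usual ggml convention that dimensions are reported innermost-first):

    import numpy as np

    # The test feeds (B, S, D) = (2, 21, 1024) straight into the graph; on the ggml
    # side the same tensor is typically described as ne = [1024, 21, 2], with the
    # batch as the outermost (slowest-varying) dimension.
    x = np.empty((2, 21, 1024), dtype=np.float32)
    assert x.shape[::-1] == (1024, 21, 2)
    assert x.strides[-1] == x.itemsize   # model_dim is contiguous, as expected for ne[0]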