
fix StandardTransformerEncoder

Guillaume Wenzek 1 year ago
parent
commit
86993cbd00

+ 5 - 3
ggml/examples/unity/fairseq2.cpp

@@ -502,7 +502,7 @@ int _determine_max_seq_len(const SequenceGeneratorJob& job, int source_seq_len)
     if (source_seq_len <= 0 || opts.soft_max_seq_len_a <= 0) {
         max_seq_len = opts.hard_max_seq_len;
     } else {
-        max_seq_len = std::min(opts.hard_max_seq_len, int(opts.soft_max_seq_len_a * source_seq_len + opts.soft_max_seq_len_b));
+        max_seq_len = std::min(opts.hard_max_seq_len, int(opts.soft_max_seq_len_a * source_seq_len) + opts.soft_max_seq_len_b);
     }
 
     if (opts.min_seq_len > max_seq_len) {
@@ -839,14 +839,16 @@ extern "C" float generate_sequence(
             encoder_output,
             encoder_padding_mask
             // state_bag=state_bag,
-        );
+        ); // (B, S, D)
 
         // state_bag.increment_step()
 
         // Because of no IncrementalStateBag decoder_output here is of shape (B, S, D)
         // Just look at the last token.
         decoder_output = ggml_slice(ctx, decoder_output, 1, step_nr, step_nr+1);
-        ggml_tensor* logits = Linear_forward(model, "final_proj", decoder_output);
+        decoder_output = ggml_cont(ctx, decoder_output);
+        decoder_output = ggml_flatten_1d(ctx, decoder_output, 0);  // (B, model_dim)
+        ggml_tensor* logits = Linear_forward(model, "final_proj", decoder_output);  // (B, vocab_size)
         ggml_tensor* lprobs = ggml_log_softmax(ctx, logits);
 
         // Compute lprobs here so we can modify it in place in the lprob tweaking phase

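The second hunk in generate_sequence() reshapes the decoder output before the final projection: after slicing out the last generated step, the (B, 1, D) view is made contiguous and flattened to (B, model_dim), so Linear_forward produces logits of shape (B, vocab_size). A minimal NumPy sketch of the intended shape flow (an illustration only, not the ggml call sequence; the projection weight is a stand-in):

    import numpy as np

    B, S, D, V = 2, 7, 1024, 256   # beams, steps so far, model_dim, vocab_size (illustrative sizes)
    decoder_output = np.random.rand(B, S, D).astype(np.float32)
    step_nr = S - 1

    last = decoder_output[:, step_nr:step_nr + 1, :]   # like ggml_slice -> (B, 1, D)
    last = np.ascontiguousarray(last)                  # like ggml_cont
    last = last.reshape(B, D)                          # like ggml_flatten_1d -> (B, model_dim)

    W = np.random.rand(V, D).astype(np.float32)        # stand-in for the final_proj weight
    logits = last @ W.T                                # (B, vocab_size)
    assert logits.shape == (B, V)
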
+ 1 - 1
ggml/examples/unity/fairseq2.h

@@ -114,7 +114,7 @@ struct SequenceGeneratorOptions {
     /// sequence length. The generated sequences (including prefix sequence) will
     /// have the maximum length of ``min(hard_max_seq_len, ax + b)``. See also
     /// ``hard_max_seq_len``.
-    int soft_max_seq_len_a = 1;
+    float soft_max_seq_len_a = 1;
     int soft_max_seq_len_b = 200;
 
     /// The hard limit on maximum length of generated sequences.

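Together with the .cpp change above, making soft_max_seq_len_a a float means the documented limit min(hard_max_seq_len, ax + b) can use a fractional coefficient, and only the a * source_seq_len product is truncated before b is added. A small sketch of that arithmetic (the helper name and the hard limit used below are illustrative, not part of the code):

    def max_seq_len(source_seq_len: int, a: float, b: int, hard: int) -> int:
        # Mirrors the updated C++ expression: truncate a * source_seq_len, then add b.
        if source_seq_len <= 0 or a <= 0:
            return hard
        return min(hard, int(a * source_seq_len) + b)

    assert max_seq_len(50, a=1.5, b=200, hard=1024) == 275    # int(75.0) + 200
    assert max_seq_len(50, a=0.5, b=200, hard=1024) == 225    # int(25.0) + 200
    assert max_seq_len(900, a=1.0, b=200, hard=1024) == 1024  # capped by the hard limit
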
+ 1 - 1
ggml/ggml.py

@@ -406,7 +406,7 @@ def ggml_unflatten_1d(
 class SequenceGeneratorOptions:
     beam_size: int
     min_seq_len: int
-    soft_max_seq_len_a: int
+    soft_max_seq_len_a: float
     soft_max_seq_len_b: int
     hard_max_seq_len: int
     len_penalty: float

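The Python-side SequenceGeneratorOptions mirrors the C++ struct, so soft_max_seq_len_a has to change type together with the header: a mismatched declaration across the FFI boundary would reinterpret the float bits as an integer. A generic ctypes sketch of why the types must stay in sync (hypothetical struct name, for illustration only; not the binding mechanism ggml.py actually uses):

    import ctypes

    class _GeneratorOpts(ctypes.Structure):          # hypothetical mirror of the C++ layout
        _fields_ = [
            ("soft_max_seq_len_a", ctypes.c_float),  # must match the C++ `float`
            ("soft_max_seq_len_b", ctypes.c_int),
        ]

    opts = _GeneratorOpts(soft_max_seq_len_a=1.0, soft_max_seq_len_b=200)
    assert abs(opts.soft_max_seq_len_a - 1.0) < 1e-6

    # Reading the same bytes with the wrong type shows what a stale `int` field would see:
    wrong = ctypes.c_int.from_buffer_copy(bytes(ctypes.c_float(1.0))).value
    assert wrong == 1065353216   # bit pattern of 1.0f misread as an integer
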
+ 10 - 10
ggml/test_unity_cpp.py

@@ -224,16 +224,16 @@ def test_MultiheadAttention_forward(ctx: Ctx, g_model: c_void_p, pt_model: Any)
 def test_StandardTransformerEncoderLayer_forward(
     ctx: Ctx, g_model: c_void_p, pt_model: Any
 ) -> None:
-    x = torch.empty((1, 21, 1024))
-    padding_mask = torch.ones((1, 21))
+    x = torch.empty((2, 21, 1024))
+    padding_mask = torch.ones((2, 21))
     torch.random.manual_seed(0)
     torch.nn.init.uniform_(x, -1, 1)
 
     layer = pt_model.text_encoder.layers[0]
 
-    gx = ggml.from_numpy(ctx, x[0])
+    gx = ggml.from_numpy(ctx, x)
     ggml.ggml_set_name(gx, b"x")
-    gpad = ggml.from_numpy(ctx, padding_mask[0])
+    gpad = ggml.from_numpy(ctx, padding_mask)
     ggml.ggml_set_name(gpad, b"padding_mask")
     gy = ggml.forward(
         "StandardTransformerEncoderLayer",
@@ -248,7 +248,7 @@ def test_StandardTransformerEncoderLayer_forward(
     y = ggml.to_numpy(gy)
 
     y_exp, _ = layer(x, padding_mask)
-    y_exp = y_exp.squeeze(0).numpy()  # remove batch dimension
+    y_exp = y_exp.numpy()
 
     assert y.shape == y_exp.shape
     assert np.allclose(y_exp, y, atol=1e-4 if UNITY_FLASH_ATTN else 1e-2)
@@ -257,14 +257,14 @@ def test_StandardTransformerEncoderLayer_forward(
 def test_StandardTransformerEncoder_forward(
     ctx: Ctx, g_model: c_void_p, pt_model: Any
 ) -> None:
-    x = torch.empty((1, 21, 1024))
-    padding_mask = torch.ones((1, 21))
+    x = torch.empty((2, 21, 1024))
+    padding_mask = torch.ones((2, 21))
     torch.random.manual_seed(0)
     torch.nn.init.uniform_(x, -1, 1)
 
-    gx = ggml.from_numpy(ctx, x[0])
+    gx = ggml.from_numpy(ctx, x)
     ggml.ggml_set_name(gx, b"x")
-    gpad = ggml.from_numpy(ctx, padding_mask[0])
+    gpad = ggml.from_numpy(ctx, padding_mask)
     ggml.ggml_set_name(gpad, b"padding_mask")
     gy = ggml.forward(
         "StandardTransformerEncoder",
@@ -279,7 +279,7 @@ def test_StandardTransformerEncoder_forward(
     y = ggml.to_numpy(gy)
 
     y_exp, _ = pt_model.text_encoder(x, padding_mask)
-    y_exp = y_exp.squeeze(0).numpy()  # remove batch dimension
+    y_exp = y_exp.numpy()
 
     assert y.shape == y_exp.shape
     assert np.allclose(y_exp, y, atol=1e-4 if UNITY_FLASH_ATTN else 1e-2)
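
Both encoder tests now keep the batch dimension (B = 2) on the ggml side instead of squeezing it away, and compare against the unsqueezed fairseq2 output, so the graph is exercised with a genuine batch rather than a single squeezed sequence. A rough sketch of the shape bookkeeping involved (assuming the usual ggml convention that dimensions are reported innermost-first):

    import numpy as np

    # The test feeds (B, S, D) = (2, 21, 1024) straight into the graph; on the ggml
    # side the same tensor is typically described as ne = [1024, 21, 2], with the
    # batch as the outermost (slowest-varying) dimension.
    x = np.empty((2, 21, 1024), dtype=np.float32)
    assert x.shape[::-1] == (1024, 21, 2)
    assert x.strides[-1] == x.itemsize   # model_dim is contiguous, as expected for ne[0]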