[GenAI] Add generateEmbedding API to CausalLMPipeline (#7227)

* add embedding

* add FromPretrained API to Phi3 model

* fix bug

* Update CausalLMPipeline.cs
Xiaoyun Zhang authored 2024-08-30 10:30:46 -07:00; committed by GitHub
Parent 1d1cc997d5
Commit 7c937bf81a
No key found matching this signature
GPG key ID: B5690EEEBB952194
6 changed files: 93 additions and 111 deletions

View file

@@ -9,6 +9,7 @@ using static TorchSharp.torch;
 using TorchSharp;
 using Microsoft.ML.GenAI.Core;
 using Microsoft.ML.GenAI.Core.Extension;
+using Microsoft.ML.Tokenizers;
 namespace Microsoft.ML.GenAI.Samples.Phi3Mini;
@@ -26,12 +27,15 @@ public class AutoGenSample
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
-        var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device, quantizeToInt8: false);
+        var tokenizerPath = Path.Combine(weightFolder, "tokenizer.model");
+        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenizerPath);
+        var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
+        var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, device);
-        var question = @"write a C# program to calculate the factorial of a number";
         // agent
         var agent = new Phi3Agent(pipeline, "assistant")
             .RegisterPrintMessage();
+        var question = @"write a C# program to calculate the factorial of a number";
         // chat with the assistant
         await agent.SendAsync(question);

View file

@@ -1,4 +1,7 @@
-using Microsoft.ML.GenAI.Phi.Extension;
+using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.GenAI.Phi;
+using Microsoft.ML.GenAI.Phi.Extension;
+using Microsoft.ML.Tokenizers;
 using Microsoft.SemanticKernel;
 using Microsoft.SemanticKernel.ChatCompletion;
 using TorchSharp;
@@ -20,8 +23,10 @@ public class SemanticKernelSample
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
-        var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device: device);
+        var tokenizerPath = Path.Combine(weightFolder, "tokenizer.model");
+        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenizerPath);
+        var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
+        var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, device);
         var kernel = Kernel.CreateBuilder()
             .AddGenAIChatCompletion(pipeline)
@@ -49,8 +54,10 @@ public class SemanticKernelSample
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
         var weightFolder = @"C:\Users\xiaoyuz\source\repos\Phi-3-mini-4k-instruct";
-        var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, device);
+        var tokenizerPath = Path.Combine(weightFolder, "tokenizer.model");
+        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenizerPath);
+        var model = Phi3ForCasualLM.FromPretrained(weightFolder, "config.json", layersOnTargetDevice: -1, quantizeToInt8: true);
+        var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, device);
         var kernel = Kernel.CreateBuilder()
             .AddGenAITextGeneration(pipeline)

View file

@@ -1,103 +0,0 @@
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-using Microsoft.ML.GenAI.Core;
-using Microsoft.ML.GenAI.Phi;
-using Tensorboard;
-using static TorchSharp.torch;
-using TorchSharp;
-using Microsoft.ML.GenAI.Core.Extension;
-using System.Text.Json;
-using Microsoft.ML.Tokenizers;
-namespace Microsoft.ML.GenAI.Samples.Phi3Mini;
-internal static class Utils
-{
-    public static ICausalLMPipeline<Tokenizer, Phi3ForCasualLM> LoadPhi3Mini4KFromFolder(
-        string weightFolder,
-        string configName = "config.json",
-        string device = "cuda",
-        int modelSizeOnCudaInGB = 55,
-        int modelSizeOnMemoryInGB = 64,
-        int modelSizeOnDiskInGB = 200,
-        bool quantizeToInt8 = false,
-        bool quantizeToInt4 = false)
-    {
-        Console.WriteLine("Loading Phi3 from huggingface model weight folder");
-        torch.set_default_device("meta");
-        var configPath = System.IO.Path.Combine(weightFolder, configName);
-        var config = JsonSerializer.Deserialize<Phi3Config>(System.IO.File.ReadAllText(configPath)) ?? throw new ArgumentNullException(nameof(configPath));
-        var timer = System.Diagnostics.Stopwatch.StartNew();
-        var model = new Phi3ForCasualLM(config);
-        var tokenzierPath = System.IO.Path.Combine(weightFolder, "tokenizer.model");
-        var tokenizer = Phi3TokenizerHelper.FromPretrained(tokenzierPath);
-        if (quantizeToInt8)
-        {
-            model.ToInt8QuantizeModule();
-        }
-        else if (quantizeToInt4)
-        {
-            model.ToInt4QuantizeModule();
-        }
-        var deviceSizeMap = new Dictionary<string, long>
-        {
-            ["cuda"] = modelSizeOnCudaInGB * 1L * 1024 * 1024 * 1024,
-            ["cpu"] = modelSizeOnMemoryInGB * 1L * 1024 * 1024 * 1024,
-            ["disk"] = modelSizeOnDiskInGB * 1L * 1024 * 1024 * 1024,
-        };
-        var deviceMap = model.InferDeviceMapForEachLayer(
-            devices: ["cuda", "cpu", "disk"],
-            deviceSizeMapInByte: deviceSizeMap);
-        var deviceMapJson = JsonSerializer.Serialize(deviceMap, new JsonSerializerOptions { WriteIndented = true });
-        Console.WriteLine($"Device map:");
-        Console.WriteLine(deviceMapJson);
-        // load weight
-        torch.set_default_device("cpu");
-        Console.WriteLine("Start loading");
-        timer = System.Diagnostics.Stopwatch.StartNew();
-        model = new Phi3ForCasualLM(config);
-        timer.Stop();
-        Console.WriteLine($"Phi3 model created in {timer.ElapsedMilliseconds / 1000} s");
-        timer = System.Diagnostics.Stopwatch.StartNew();
-        model.LoadSafeTensors(weightFolder);
-        timer.Stop();
-        Console.WriteLine($"Phi3 weight loaded in {timer.ElapsedMilliseconds / 1000} s");
-        if (quantizeToInt8 || quantizeToInt4)
-        {
-            timer = System.Diagnostics.Stopwatch.StartNew();
-            Console.WriteLine("Start quantizing if needed");
-            if (quantizeToInt8)
-            {
-                model.ToInt8QuantizeModule();
-            }
-            else if (quantizeToInt4)
-            {
-                model.ToInt4QuantizeModule();
-            }
-            Console.WriteLine("Quantizing done");
-            timer.Stop();
-            Console.WriteLine($"Quantizing done in {timer.ElapsedMilliseconds / 1000} s");
-        }
-        timer = System.Diagnostics.Stopwatch.StartNew();
-        Console.WriteLine($"Start loading to device: {device}");
-        model = model.ToDynamicLoadingModel(deviceMap, "cuda");
-        timer.Stop();
-        Console.WriteLine($"Phi3 loaded to device: {device} in {timer.ElapsedMilliseconds / 1000} s");
-        var pipeline = new CausalLMPipeline<Tokenizer, Phi3ForCasualLM>(tokenizer, model, device);
-        torch.set_default_device(device);
-        return pipeline;
-    }
-}
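
With this helper deleted, the samples construct the pipeline directly, as the two updated sample files above show. One behavioral difference worth noting: the deleted helper budgeted device placement in bytes (modelSizeOnCudaInGB and friends), while the new Phi3ForCasualLM.FromPretrained introduced later in this PR counts layers. A minimal migration sketch, assuming the same weight-folder layout; the folder path and layer count are placeholders:

// Before (deleted): byte budgets per device decided where layers went.
// var pipeline = Utils.LoadPhi3Mini4KFromFolder(weightFolder, modelSizeOnCudaInGB: 16);

// After: state directly how many layers live on the target device.
var weightFolder = @"C:\models\Phi-3-mini-4k-instruct"; // placeholder path
var tokenizer = Phi3TokenizerHelper.FromPretrained(Path.Combine(weightFolder, "tokenizer.model"));
var model = Phi3ForCasualLM.FromPretrained(weightFolder, quantizeToInt8: true, layersOnTargetDevice: 16);
var pipeline = new CausalLMPipeline<LlamaTokenizer, Phi3ForCasualLM>(tokenizer, model, "cuda");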

View file

@@ -1,4 +1,4 @@
 // See https://aka.ms/new-console-template for more information
 using Microsoft.ML.GenAI.Samples.Phi3Mini;
-await SemanticKernelSample.RunChatCompletionSample();
+await AutoGenSample.RunAsync();

View file

@@ -32,6 +32,11 @@ public interface ICausalLMPipeline
         float topP = CausalLMPipeline.Defaults.TopP,
         string[]? stopSequences = CausalLMPipeline.Defaults.StopSequence);
+    /// <summary>
+    /// Generate the embedding (the last hidden state of the last token) for the prompt. The embedding is normalized by its L2 norm.
+    /// </summary>
+    float[] GenerateEmbeddingFromLastTokenPool(string prompt);
     IEnumerable<string> GenerateStreaming(
         string prompt,
         int maxLen = CausalLMPipeline.Defaults.MaxLen,
@@ -281,4 +286,23 @@ public class CausalLMPipeline : ICausalLMPipeline
         nextToken = torch.gather(probsIndex, dim: -1, index: nextToken);
         return nextToken;
     }
+    public float[] GenerateEmbeddingFromLastTokenPool(string prompt)
+    {
+        using var scope = NewDisposeScope();
+        using var noGrad = torch.no_grad();
+        var inputIds = this.Tokenizer.EncodeToIds(prompt);
+        var inputTensor = torch.tensor(inputIds.ToArray(), dtype: ScalarType.Int64, device: this.Device).unsqueeze(0);
+        var attentionMask = torch.ones_like(inputTensor, device: this.Device);
+        var input = new CausalLMModelInput(inputTensor, attentionMask, pastKeyValuesLength: 0);
+        var output = this.Model.forward(input);
+        var lastTokenHiddenState = output.LastHiddenState[0, ^1];
+        // shape of lastTokenHiddenState: [hidden_size]
+        // L2 norm
+        var norm = lastTokenHiddenState.norm();
+        var normalized = lastTokenHiddenState / norm;
+        return normalized.to_type(ScalarType.Float32).data<float>().ToArray();
+    }
 }
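
A usage sketch of the new embedding API (the pipeline variable and the prompts are illustrative, not part of this change). Because GenerateEmbeddingFromLastTokenPool returns L2-normalized vectors, a plain dot product already yields the cosine similarity:

// pipeline is an ICausalLMPipeline built as in the samples above.
float[] a = pipeline.GenerateEmbeddingFromLastTokenPool("The quick brown fox");
float[] b = pipeline.GenerateEmbeddingFromLastTokenPool("A fast auburn fox");

// Both vectors are unit length, so the dot product equals cosine similarity.
float cosine = 0f;
for (int i = 0; i < a.Length; i++)
{
    cosine += a[i] * b[i];
}
Console.WriteLine($"cosine similarity: {cosine}");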

View file

@@ -9,6 +9,7 @@ using System.Text;
 using System.Text.Json;
 using System.Threading.Tasks;
 using Microsoft.ML.GenAI.Core;
+using Microsoft.ML.GenAI.Core.Extension;
 using Microsoft.ML.GenAI.Phi.Module;
 using TorchSharp;
 using TorchSharp.Modules;
@@ -66,6 +67,55 @@ public class Phi3ForCasualLM : nn.Module<CausalLMModelInput, CausalLMModelOutput
         return phi;
     }
+    public static Phi3ForCasualLM FromPretrained(
+        string modelFolder,
+        string configName = "config.json",
+        string checkPointName = "model.safetensors.index.json",
+        bool quantizeToInt8 = false,
+        bool quantizeToInt4 = false,
+        int layersOnTargetDevice = -1,
+        ScalarType torchDtype = ScalarType.BFloat16,
+        string targetDevice = "cuda")
+    {
+        if (layersOnTargetDevice == -1 && quantizeToInt4 == false && quantizeToInt8 == false)
+        {
+            return FromPretrained(modelFolder, configName, checkPointName, torchDtype, targetDevice);
+        }
+        var originalDefaultDevice = torch.get_default_device();
+        torch.set_default_device("meta");
+        var config = Path.Join(modelFolder, configName);
+        var modelConfig = JsonSerializer.Deserialize<Phi3Config>(File.ReadAllText(config)) ?? throw new ArgumentNullException(nameof(config));
+        modelConfig.DType = torchDtype;
+        var model = new Phi3ForCasualLM(modelConfig);
+        if (quantizeToInt8)
+        {
+            model.ToInt8QuantizeModule();
+        }
+        else if (quantizeToInt4)
+        {
+            model.ToInt4QuantizeModule();
+        }
+        var deviceMap = model.InferDeviceMapForEachLayer(
+            [
+                KeyValuePair.Create(targetDevice, layersOnTargetDevice),
+                KeyValuePair.Create("cpu", -1)
+            ]);
+        torch.set_default_device("cpu");
+        model = new Phi3ForCasualLM(modelConfig);
+        model.LoadSafeTensors(modelFolder, checkPointName);
+        model = model.ToDynamicLoadingModel(deviceMap, targetDevice);
+        torch.set_default_device(originalDefaultDevice);
+        return model;
+    }
     public void LoadSafeTensors(string modelFolder, string checkPointName = "model.safetensors.index.json")
     {
         this.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: false, useTqdm: false);
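
To recap the control flow above: with no quantization and the default layersOnTargetDevice of -1, the method delegates to the plain FromPretrained overload; otherwise it builds the model on the meta device to size it, infers a per-layer device map, reloads the real weights on the CPU, and wraps the result for cross-device execution via ToDynamicLoadingModel. A sketch of both call shapes (the folder path and layer count are illustrative):

// Fast path: everything is loaded straight onto the target device.
var full = Phi3ForCasualLM.FromPretrained(@"C:\models\Phi-3-mini-4k-instruct");

// Layered path: int8-quantize and keep 10 layers on cuda; the rest go to cpu
// per the KeyValuePair.Create("cpu", -1) entry in the device map above.
var partial = Phi3ForCasualLM.FromPretrained(
    @"C:\models\Phi-3-mini-4k-instruct",
    quantizeToInt8: true,
    layersOnTargetDevice: 10);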