// File: Dynamic\NgramExtraction.cs
// Web Access
// Project: src\docs\samples\Microsoft.ML.Samples\Microsoft.ML.Samples.csproj (Microsoft.ML.Samples)
using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;
 
namespace Samples.Dynamic
{
    public static partial class TransformSamples
    {
        /// <summary>
        /// Tokenizes a small in-memory set of sentences into characters and
        /// prints the per-row n-gram counts produced by two pipelines: one
        /// configured with <c>ngramLength: 1</c> and one using the default
        /// settings of <c>ProduceNgrams</c>.
        /// </summary>
        public static void Example()
        {
            // Create a new ML context, for ML.NET operations. It can be used for
            // exception tracking and logging, as well as the source of randomness.
            var ml = new MLContext();

            // Get a small dataset as an IEnumerable.
            var data = new List<SampleSentimentData>()
            {
                new SampleSentimentData
                {
                    Sentiment = true,
                    SentimentText = "Best game I've ever played."
                },

                new SampleSentimentData
                {
                    Sentiment = false,
                    SentimentText = "==RUDE== Dude, 2"
                },

                new SampleSentimentData
                {
                    Sentiment = true,
                    // The literal is split across two lines; the trailing
                    // space after the comma keeps "game, this" intact so the
                    // text matches the preview and the counts shown below.
                    SentimentText = "Until the next game, " +
                        "this is the best Xbox game!"
                }
            };

            // Convert IEnumerable to IDataView.
            var trainData = ml.Data.LoadFromEnumerable(data);

            // Preview of the data.
            //
            // Sentiment    SentimentText
            // true         Best game I've ever played.
            // false        ==RUDE== Dude, 2
            // true         Until the next game, this is the best Xbox game!

            // Pipelines that tokenize the text into characters and then combine
            // those characters into n-grams. The first n-gram estimator is
            // restricted to single characters (ngramLength: 1); the second uses
            // the default settings, which (per the ML.NET documentation) are
            // ngramLength: 2 with all shorter lengths included - this is why the
            // CharsTwograms output below contains both unigrams and bigrams.
            var charsPipeline = ml.Transforms.Text
                .TokenizeIntoCharactersAsKeys("Chars", "SentimentText",
                useMarkerCharacters: false);

            var ngramOnePipeline = ml.Transforms.Text
                .ProduceNgrams("CharsUnigrams", "Chars", ngramLength: 1);

            var ngramTwoPipeline = ml.Transforms.Text
                .ProduceNgrams("CharsTwograms", "Chars");

            var oneCharsPipeline = charsPipeline
                .Append(ngramOnePipeline);

            var twoCharsPipeline = charsPipeline
                .Append(ngramTwoPipeline);

            // The transformed data for pipelines.
            var transformedData_onechars = oneCharsPipeline.Fit(trainData)
                .Transform(trainData);

            var transformedData_twochars = twoCharsPipeline.Fit(trainData)
                .Transform(trainData);

            // Small helper to print the text inside the columns, in the console.
            // For every row it writes each stored (slot name, count) pair on one
            // line, framed by a header and a separator.
            void PrintColumn(string columnName,
                IEnumerable<VBuffer<float>> column,
                VBuffer<ReadOnlyMemory<char>> names)
            {
                Console.WriteLine(
                    $"{columnName} column obtained post-transformation.");

                // NOTE(review): indexing the values span by item.Key assumes the
                // slot-name buffer is dense - confirm if reused elsewhere.
                var slots = names.GetValues();
                foreach (var featureRow in column)
                {
                    foreach (var item in featureRow.Items())
                    {
                        Console.Write($"'{slots[item.Key]}' - {item.Value} ");
                    }

                    Console.WriteLine();
                }

                Console.WriteLine(
                    "===================================================");
            }

            // Preview of the CharsUnigrams column obtained after processing the
            // input.
            VBuffer<ReadOnlyMemory<char>> slotNames = default;
            transformedData_onechars.Schema["CharsUnigrams"]
                .GetSlotNames(ref slotNames);

            var charsOneGramColumn = transformedData_onechars
                .GetColumn<VBuffer<float>>(transformedData_onechars
                .Schema["CharsUnigrams"]);

            PrintColumn("CharsUnigrams", charsOneGramColumn, slotNames);

            // CharsUnigrams column obtained post-transformation.
            // 'B' - 1 'e' - 6 's' - 1 't' - 1 '<?>' - 4 'g' - 1 'a' - 2 'm' - 1 'I' - 1 ''' - 1 'v' - 2 ...
            // 'e' - 1 '<?>' - 2 'd' - 1 '=' - 4 'R' - 1 'U' - 1 'D' - 2 'E' - 1 'u' - 1 ',' - 1 '2' - 1
            // 'B' - 0 'e' - 6 's' - 3 't' - 6 '<?>' - 9 'g' - 2 'a' - 2 'm' - 2 'I' - 0 ''' - 0 'v' - 0 ...

            // Preview of the CharsTwograms column obtained after processing the
            // input.
            var charsTwoGramColumn = transformedData_twochars
                .GetColumn<VBuffer<float>>(transformedData_twochars
                .Schema["CharsTwograms"]);

            // slotNames is reused: GetSlotNames overwrites it with the slot
            // names of the CharsTwograms column.
            transformedData_twochars.Schema["CharsTwograms"]
                .GetSlotNames(ref slotNames);

            PrintColumn("CharsTwograms", charsTwoGramColumn, slotNames);

            // CharsTwograms column obtained post-transformation.
            // 'B' - 1 'B|e' - 1 'e' - 6 'e|s' - 1 's' - 1 's|t' - 1 't' - 1 't|<?>' - 1 '<?>' - 4 '<?>|g' - 1 ...
            // 'e' - 1 '<?>' - 2 'd' - 1 '=' - 4 '=|=' - 2 '=|R' - 1 'R' - 1 'R|U' - 1 'U' - 1 'U|D' - 1 'D' - 2 ...
            // 'B' - 0 'B|e' - 0 'e' - 6 'e|s' - 1 's' - 3 's|t' - 1 't' - 6 't|<?>' - 2 '<?>' - 9 '<?>|g' - 2 ...
        }
 
        /// <summary>
        /// One row of the sample dataset: a piece of text together with the
        /// sentiment label assigned to it. The label is a boolean; in the
        /// sample rows above, <c>true</c> marks the positive sentences and
        /// <c>false</c> the negative one.
        /// </summary>
        public class SampleSentimentData
        {
            // Sentiment label for the text (boolean, not a 0/1 number).
            public bool Sentiment { get; set; }
            // Raw text; read by the pipeline as the "SentimentText" column.
            public string SentimentText { get; set; }
        }
    }
}