|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.Runtime;
using Microsoft.ML.Trainers;
namespace Microsoft.ML
{
/// <summary>
/// Extension methods that expose the recommendation catalog on <see cref="MLContext"/>.
/// </summary>
public static class RecommenderCatalog
{
    /// <summary>
    /// Trainers and tasks specific to recommendation problems.
    /// </summary>
    /// <param name="ctx">The <see cref="MLContext"/> the catalog is created from.</param>
    /// <returns>A new <see cref="RecommendationCatalog"/> constructed from <paramref name="ctx"/>.</returns>
    public static RecommendationCatalog Recommendation(this MLContext ctx)
    {
        return new RecommendationCatalog(ctx);
    }
}
/// <summary>
/// The central catalog for recommendation trainers and tasks.
/// </summary>
public sealed class RecommendationCatalog : TrainCatalogBase
{
    /// <summary>
    /// The list of trainers for performing recommendation.
    /// </summary>
    public RecommendationTrainers Trainers { get; }

    /// <summary>
    /// Initializes the catalog on top of <paramref name="env"/> and creates its <see cref="Trainers"/> list.
    /// </summary>
    /// <param name="env">The host environment forwarded to the base catalog.</param>
    internal RecommendationCatalog(IHostEnvironment env)
        : base(env, nameof(RecommendationCatalog))
    {
        Trainers = new RecommendationTrainers(this);
    }

    /// <summary>
    /// Factory methods that create the recommendation trainers exposed by <see cref="RecommendationCatalog"/>.
    /// </summary>
    public sealed class RecommendationTrainers : CatalogInstantiatorBase
    {
        /// <summary>
        /// Binds this trainer list to the catalog that owns it.
        /// </summary>
        /// <param name="catalog">The owning <see cref="RecommendationCatalog"/>.</param>
        internal RecommendationTrainers(RecommendationCatalog catalog)
            : base(catalog)
        {
        }

        /// <summary>
        /// Create <see cref="MatrixFactorizationTrainer"/>, which predicts element values in a matrix using matrix factorization.
        /// </summary>
        /// <remarks>
        /// <para>The basic idea of matrix factorization is finding two low-rank factor matrices to approximate the training matrix.</para>
        /// <para>In this module, the expected training data is a list of tuples. Every tuple consists of a column index, a row index,
        /// and the value at the location specified by the two indexes.
        /// </para>
        /// </remarks>
        /// <param name="labelColumnName">The name of the label column. The column data must be <see cref="System.Single"/>.</param>
        /// <param name="matrixColumnIndexColumnName">The name of the column hosting the matrix's column IDs.
        /// The column data must be <see cref="Microsoft.ML.Data.KeyDataViewType"/>.</param>
        /// <param name="matrixRowIndexColumnName">The name of the column hosting the matrix's row IDs.
        /// The column data must be <see cref="Microsoft.ML.Data.KeyDataViewType"/>.</param>
        /// <param name="approximationRank">Rank of approximation matrices.</param>
        /// <param name="learningRate">Initial learning rate. It specifies the speed of the training algorithm.</param>
        /// <param name="numberOfIterations">Number of training iterations.</param>
        /// <returns>A new <see cref="MatrixFactorizationTrainer"/> configured with the given arguments.</returns>
        /// <example>
        /// <format type="text/markdown">
        /// <![CDATA[
        /// [!code-csharp[MatrixFactorization](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs)]
        /// ]]></format>
        /// </example>
        public MatrixFactorizationTrainer MatrixFactorization(
            string labelColumnName,
            string matrixColumnIndexColumnName,
            string matrixRowIndexColumnName,
            int approximationRank = MatrixFactorizationTrainer.Defaults.ApproximationRank,
            double learningRate = MatrixFactorizationTrainer.Defaults.LearningRate,
            int numberOfIterations = MatrixFactorizationTrainer.Defaults.NumIterations)
        {
            // The trainer is created against the environment of the owning catalog.
            var hostEnvironment = Owner.GetEnvironment();
            return new MatrixFactorizationTrainer(
                hostEnvironment,
                labelColumnName,
                matrixColumnIndexColumnName,
                matrixRowIndexColumnName,
                approximationRank,
                learningRate,
                numberOfIterations);
        }

        /// <summary>
        /// Create <see cref="MatrixFactorizationTrainer"/> with advanced options, which predicts element values in a matrix using matrix factorization.
        /// </summary>
        /// <remarks>
        /// <para>The basic idea of matrix factorization is finding two low-rank factor matrices to approximate the training matrix.</para>
        /// <para>In this module, the expected training data is a list of tuples. Every tuple consists of a column index, a row index,
        /// and the value at the location specified by the two indexes. The training configuration is encoded in <see cref="MatrixFactorizationTrainer.Options"/>.
        /// To invoke one-class matrix factorization, the user needs to specify <see cref="MatrixFactorizationTrainer.LossFunctionType.SquareLossOneClass"/>.
        /// The default setting <see cref="MatrixFactorizationTrainer.LossFunctionType.SquareLossRegression"/> is for the standard matrix factorization problem.
        /// </para>
        /// </remarks>
        /// <param name="options">Trainer options.</param>
        /// <returns>A new <see cref="MatrixFactorizationTrainer"/> configured from <paramref name="options"/>.</returns>
        /// <example>
        /// <format type="text/markdown">
        /// <![CDATA[
        /// [!code-csharp[MatrixFactorization](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs)]
        /// [!code-csharp[MatrixFactorization](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/OneClassMatrixFactorizationWithOptions.cs)]
        /// ]]></format>
        /// </example>
        public MatrixFactorizationTrainer MatrixFactorization(
            MatrixFactorizationTrainer.Options options)
        {
            var hostEnvironment = Owner.GetEnvironment();
            return new MatrixFactorizationTrainer(hostEnvironment, options);
        }
    }

    /// <summary>
    /// Evaluates the scored recommendation data.
    /// </summary>
    /// <param name="data">The scored data.</param>
    /// <param name="labelColumnName">The name of the label column in <paramref name="data"/>.</param>
    /// <param name="scoreColumnName">The name of the score column in <paramref name="data"/>.</param>
    /// <returns>The regression metrics computed by a <see cref="RegressionEvaluator"/> over the label and score columns.</returns>
    public RegressionMetrics Evaluate(IDataView data, string labelColumnName = DefaultColumnNames.Label, string scoreColumnName = DefaultColumnNames.Score)
    {
        // Run the environment's argument checks before any evaluator is constructed.
        Environment.CheckValue(data, nameof(data));
        Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));
        Environment.CheckNonEmpty(scoreColumnName, nameof(scoreColumnName));

        // Recommendation scores are evaluated with the regression evaluator using its default arguments.
        var evaluatorArguments = new RegressionEvaluator.Arguments();
        var evaluator = new RegressionEvaluator(Environment, evaluatorArguments);
        return evaluator.Evaluate(data, labelColumnName, scoreColumnName);
    }

    /// <summary>
    /// Run cross-validation over <paramref name="numberOfFolds"/> folds of <paramref name="data"/>, by fitting <paramref name="estimator"/>,
    /// and respecting <paramref name="samplingKeyColumnName"/> if provided.
    /// Then evaluate each sub-model against <paramref name="labelColumnName"/> and return metrics.
    /// </summary>
    /// <remarks>
    /// Each fold is evaluated through <see cref="Evaluate"/> with its default score column name.
    /// </remarks>
    /// <param name="data">The data to run cross-validation on.</param>
    /// <param name="estimator">The estimator to fit.</param>
    /// <param name="numberOfFolds">Number of cross-validation folds.</param>
    /// <param name="labelColumnName">The label column (for evaluation).</param>
    /// <param name="samplingKeyColumnName">Optional name of the column to use as a stratification column. If two examples share the same value of the <paramref name="samplingKeyColumnName"/>
    /// (if provided), they are guaranteed to appear in the same subset (train or test). Use this to make sure there is no label leakage from train to the test set.
    /// If this optional parameter is not provided, a stratification column will be generated, and its values will be random numbers.</param>
    /// <param name="seed">Optional parameter used in combination with the <paramref name="samplingKeyColumnName"/>.
    /// If the <paramref name="samplingKeyColumnName"/> is not provided, the random numbers generated to create it will use this seed as value.
    /// And if it is not provided, the default value will be used.</param>
    /// <returns>Per-fold results: metrics, models, scored datasets.</returns>
    public IReadOnlyList<CrossValidationResult<RegressionMetrics>> CrossValidate(
        IDataView data, IEstimator<ITransformer> estimator, int numberOfFolds = 5, string labelColumnName = DefaultColumnNames.Label,
        string samplingKeyColumnName = null, int? seed = null)
    {
        Environment.CheckNonEmpty(labelColumnName, nameof(labelColumnName));

        // The remaining arguments are handed to CrossValidateTrain unchanged.
        var trainedFolds = CrossValidateTrain(data, estimator, numberOfFolds, samplingKeyColumnName, seed);

        // Attach regression metrics to every trained fold, preserving fold order.
        var perFoldResults = new List<CrossValidationResult<RegressionMetrics>>();
        foreach (var fold in trainedFolds)
        {
            var foldMetrics = Evaluate(fold.Scores, labelColumnName);
            perFoldResults.Add(new CrossValidationResult<RegressionMetrics>(fold.Model, foldMetrics, fold.Scores, fold.Fold));
        }

        return perFoldResults.ToArray();
    }
}
}
|