// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Runtime;
namespace Microsoft.ML.Trainers.FastTree
{
/// <summary>
/// A dataset of binned feature values, organized into feature flocks, together with the
/// feature-independent query/document data held in its <see cref="DatasetSkeleton"/>.
/// </summary>
internal sealed class Dataset
{
private readonly DatasetSkeleton _datasetSkeleton;
private readonly FeatureFlockBase[] _flocks;
// Maps the index of a flock to the index of the first feature of that flock.
private readonly int[] _flockToFirstFeature;
// Maps the index of a feature to the flock containing that feature. In combination with
// _flockToFirstFeature, this can easily recover the feature's sub-index within the flock itself.
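// For example, with three flocks of sizes {3, 2, 4}, _flockToFirstFeature is {0, 3, 5} and
// _featureToFlock is {0, 0, 0, 1, 1, 2, 2, 2, 2}; dataset feature 4 thus maps to flock 1, sub-feature 4 - 3 = 1.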
private readonly int[] _featureToFlock;
public UInt32[] DupeIds { get; private set; }
public enum DupeIdInfo
{
NoInformation = 0,
Unique = 1,
FormatNotSupported = 1000000,
Code404 = 1000001
};
public const int Version = 3;
/// <summary>
/// Initializes a new instance of the <see cref="Dataset"/> class.
/// </summary>
/// <param name="datasetSkeleton">The dataset skeleton corresponding to the features</param>
/// <param name="flocks">An array of feature flocks</param>
public Dataset(DatasetSkeleton datasetSkeleton, FeatureFlockBase[] flocks)
{
Contracts.AssertValue(datasetSkeleton);
Contracts.AssertValue(flocks);
Contracts.Assert(flocks.All(f => f.Examples == datasetSkeleton.NumDocs));
_datasetSkeleton = datasetSkeleton;
_maxDocsPerQuery = -1;
_flocks = flocks;
_flockToFirstFeature = new int[_flocks.Length];
if (_flocks.Length > 0)
{
for (int i = 1; i < _flocks.Length; ++i)
{
Contracts.AssertValue(_flocks[i - 1]);
_flockToFirstFeature[i] = _flockToFirstFeature[i - 1] + _flocks[i - 1].Count;
}
var lastFlock = _flocks[_flocks.Length - 1];
Contracts.AssertValue(lastFlock);
int numFeatures = _flockToFirstFeature[_flockToFirstFeature.Length - 1] + lastFlock.Count;
Contracts.Assert(numFeatures == _flocks.Sum(f => f.Count));
_featureToFlock = new int[numFeatures];
for (int flock = 0; flock < _flockToFirstFeature.Length; ++flock)
{
int min = _flockToFirstFeature[flock];
int lim = min + _flocks[flock].Count;
for (int feat = min; feat < lim; ++feat)
_featureToFlock[feat] = flock;
}
}
else
_featureToFlock = new int[0];
}
/// <summary>
/// Maps a global feature index to the index of the flock containing it, as well as the
/// index of the subfeature within that flock.
/// </summary>
/// <param name="feature">The index of the feature at the dataset level</param>
/// <param name="flock">The index of the flock containing this feature</param>
/// <param name="subfeature">The index of the feature within the flock</param>
public void MapFeatureToFlockAndSubFeature(int feature, out int flock, out int subfeature)
{
Contracts.Assert(0 <= feature && feature < NumFeatures);
flock = _featureToFlock[feature];
subfeature = feature - _flockToFirstFeature[flock];
Contracts.Assert(0 <= flock && flock < NumFlocks);
Contracts.Assert(0 <= subfeature && subfeature < _flocks[flock].Count);
}
/// <summary>
/// Given a flock index, returns the index of the first feature in this flock.
/// </summary>
/// <param name="flock">Index of the flock</param>
/// <returns>The index of the first feature that belongs to this flock</returns>
public int FlockToFirstFeature(int flock)
{
Contracts.Assert(0 <= flock && flock < NumFlocks);
return _flockToFirstFeature[flock];
}
#region Skeleton, skeleton passthroughs, and skeleton derived quantities
/// <summary>
/// Gets the dataset skeleton.
/// </summary>
/// <value>The skeleton.</value>
public DatasetSkeleton Skeleton => _datasetSkeleton;
/// <summary>
/// Gets the labels.
/// </summary>
/// <value>The labels.</value>
public short[] Ratings => _datasetSkeleton.Ratings;
public double[] Targets => _datasetSkeleton.ActualTargets;
/// <summary>
/// Gets the boundaries.
/// </summary>
/// <value>The boundaries.</value>
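/// <remarks>Boundaries has NumQueries + 1 entries; query q spans documents
/// [Boundaries[q], Boundaries[q + 1]). For example, { 0, 4, 7 } describes two queries
/// of 4 and 3 documents.</remarks>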
public int[] Boundaries => _datasetSkeleton.Boundaries;
/// <summary>
/// Gets the query ids.
/// </summary>
/// <value>The query ids.</value>
public ulong[] QueryIds => _datasetSkeleton.QueryIds;
/// <summary>
/// Gets the doc ids.
/// </summary>
/// <value>The doc ids.</value>
public ulong[] DocIds => _datasetSkeleton.DocIds;
/// <summary>
/// Gets the max DCG.
/// </summary>
/// <value>The max DCG.</value>
public double[][] MaxDcg => _datasetSkeleton.MaxDcg;
private int _maxDocsPerQuery;
/// <summary>
/// Gets the maximum number of docs in any query.
/// </summary>
/// <value>The maximum number of docs in any query.</value>
public int MaxDocsPerQuery
{
get
{
if (_maxDocsPerQuery < 0)
{
if (NumQueries == 0)
_maxDocsPerQuery = 0;
else
_maxDocsPerQuery = Enumerable.Range(0, NumQueries).Select(NumDocsInQuery).Max();
}
return _maxDocsPerQuery;
}
}
/// <summary>
/// Gets the number of docs in the entire dataset.
/// </summary>
/// <value>The number of docs in the entire dataset.</value>
public int NumDocs
{
get { return _datasetSkeleton.NumDocs; }
}
/// <summary>
/// Gets the number of docs in a given query.
/// </summary>
/// <param name="queryIndex">Index of the query.</param>
/// <returns>the number of docs in the query</returns>
public int NumDocsInQuery(int queryIndex)
{
return _datasetSkeleton.Boundaries[queryIndex + 1] - _datasetSkeleton.Boundaries[queryIndex];
}
/// <summary>
/// Gets the number of queries in the dataset.
/// </summary>
/// <value>The number of queries in the dataset.</value>
public int NumQueries
{
get { return _datasetSkeleton.NumQueries; }
}
/// <summary>
/// Gets the mapping from document index to the index of the query containing that document.
/// </summary>
/// <value>The document-to-query index map</value>
public int[] DocToQuery
{
get { return _datasetSkeleton.DocToQuery; }
}
/// <summary>
/// Returns the per-document sample weights stored in the underlying dataset skeleton
/// </summary>
public double[] SampleWeights
{
get { return _datasetSkeleton.SampleWeights; }
}
/// <summary>
/// Returns the size in bytes of the binary representation of this dataset (the skeleton plus all flocks)
/// </summary>
public long SizeInBytes()
{
return _datasetSkeleton.SizeInBytes() + _flocks.Sum(x => (long)x.SizeInBytes());
}
#endregion
/// <summary>
/// Gets the array of feature flocks.
/// </summary>
/// <value>The array of feature flocks.</value>
public FeatureFlockBase[] Flocks
{
get { return _flocks; }
}
/// <summary>
/// The number of feature flocks.
/// </summary>
public int NumFlocks
{
get { return _flocks.Length; }
}
/// <summary>
/// The number of features.
/// </summary>
public int NumFeatures
{
get { return _featureToFlock.Length; }
}
public IIntArrayForwardIndexer GetIndexer(int feature)
{
Contracts.Assert(0 <= feature && feature < _featureToFlock.Length);
int flock;
int subfeature;
MapFeatureToFlockAndSubFeature(feature, out flock, out subfeature);
return _flocks[flock].GetIndexer(subfeature);
}
/// <summary>
/// Splits the dataset by queries into disjoint parts.
/// </summary>
/// <param name="fraction">An array of the fractional sizes of the parts; must sum to 1.0</param>
/// <param name="randomSeed">A seed that deterministically defines the split</param>
/// <param name="destroyThisDataset">Whether the flocks of this dataset should be destroyed
/// on the fly as the new datasets are created (this reduces peak memory)</param>
/// <returns>An array of datasets, one per part</returns>
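/// <remarks>For example, Split(new[] { 0.8, 0.2 }, seed, false) yields an 80/20 partition of the
/// queries (and their documents) into two new datasets, leaving this dataset intact.</remarks>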
public Dataset[] Split(double[] fraction, int randomSeed, bool destroyThisDataset)
{
int numParts = fraction.Length;
int[][] assignment;
DatasetSkeleton[] datasetSkeletonPart = _datasetSkeleton.Split(fraction, randomSeed, out assignment);
FeatureFlockBase[][] featureParts = Utils.BuildArray(numParts, i => new FeatureFlockBase[NumFlocks]);
Parallel.For(0, NumFlocks, new ParallelOptions { MaxDegreeOfParallelism = BlockingThreadPool.NumThreads },
(int flockIndex) =>
{
SplitThreadWorker(featureParts, flockIndex, assignment, destroyThisDataset);
});
// create and return the datasets
Dataset[] datasets = Enumerable.Range(0, numParts).Select(p => datasetSkeletonPart[p] == null ?
null : new Dataset(datasetSkeletonPart[p], featureParts[p])).ToArray(numParts);
return datasets;
}
/// <summary>
/// Creates a new Dataset, which includes a subset of the docs in this Dataset.
/// </summary>
/// <param name="docIndices">A sorted array of doc indices</param>
/// <param name="destroyThisDataset">Determines if this Dataset is deleted on the fly as the
/// new one is created (this reduces peak memory)</param>
public Dataset GetSubDataset(int[] docIndices, bool destroyThisDataset)
{
#if !NO_STORE
return GetSubDataset(docIndices, destroyThisDataset, null);
}
public Dataset GetSubDataset(int[] docIndices, bool destroyThisDataset, FileObjectStore<IntArrayFormatter> newBinsCache)
{
#endif
int[] queryIndices = docIndices.Select(d => DocToQuery[d]).ToArray();
ulong[] uniqueQueryIds = queryIndices.Distinct().Select(q => QueryIds[q]).ToArray();
// calculate boundaries
int[] boundaries = new int[uniqueQueryIds.Length + 1];
boundaries[0] = 0;
int queryIndex = 1;
for (int q = 1; q < queryIndices.Length; ++q)
{
if (queryIndices[q] != queryIndices[q - 1])
boundaries[queryIndex++] = q;
}
boundaries[uniqueQueryIds.Length] = queryIndices.Length;
// construct skeleton
DatasetSkeleton datasetSkeleton = new DatasetSkeleton(docIndices.Select(d => Ratings[d]).ToArray(),
boundaries,
uniqueQueryIds,
docIndices.Select(d => DocIds[d]).ToArray());
// create features
FeatureFlockBase[] features = new FeatureFlockBase[NumFlocks];
int[][] assignment = new int[][] { docIndices };
Parallel.For(0, NumFlocks, new ParallelOptions { MaxDegreeOfParallelism = BlockingThreadPool.NumThreads },
(int flockIndex) =>
{
#if !NO_STORE
GetSubDataset_ThreadWorker(features, flockIndex, assignment, destroyThisDataset, newBinsCache);
#else
GetSubDatasetThreadWorker(features, flockIndex, assignment, destroyThisDataset);
#endif
});
uint[] filteredDupeIds = null;
// Filter the dupe ids, if any
if (DupeIds != null)
{
uint[] dupeIds = DupeIds;
filteredDupeIds = docIndices.Select(i => dupeIds[i]).ToArray();
}
// auxiliary data
Dictionary<string, DatasetSkeletonQueryDocData> auxData = _datasetSkeleton.AuxiliaryData;
Dictionary<string, DatasetSkeletonQueryDocData> newAuxData = new Dictionary<string, DatasetSkeletonQueryDocData>();
foreach (KeyValuePair<string, DatasetSkeletonQueryDocData> pair in auxData)
{
newAuxData[pair.Key] = pair.Value.GetSubset(pair.Value.IsQueryLevel ? queryIndices.Distinct().ToArray() : docIndices);
}
datasetSkeleton.AuxiliaryData = newAuxData;
// create new Dataset
Dataset dataset = new Dataset(datasetSkeleton, features);
dataset.DupeIds = filteredDupeIds;
return dataset;
}
#if !NO_STORE
private void GetSubDataset_ThreadWorker(FeatureFlockBase[] features, int f, int[][] docAssignment, bool destroyThisDataset, FileObjectStore<IntArrayFormatter> newBinsCache)
{
features[f] = Flocks[f].Split(docAssignment)[0];
features[f].BinsCache = newBinsCache;
if (newBinsCache != null)
{
features[f].Bins = null;
}
if (destroyThisDataset)
Flocks[f] = null;
}
#else
private void GetSubDatasetThreadWorker(FeatureFlockBase[] features, int f, int[][] docAssignment, bool destroyThisDataset)
{
features[f] = Flocks[f].Split(docAssignment)[0];
if (destroyThisDataset)
Flocks[f] = null;
}
#endif
private void SplitThreadWorker(FeatureFlockBase[][] features, int f, int[][] docAssignment, bool destroyThisDataset)
{
FeatureFlockBase[] featureParts = Flocks[f].Split(docAssignment);
for (int i = 0; i < docAssignment.Length; ++i)
features[i][f] = featureParts[i];
if (destroyThisDataset)
Flocks[f] = null;
}
/// <summary>
/// Returns a row-wise forward indexer across multiple features in the dataset.
/// </summary>
/// <param name="activeFeatures">Boolean array indicating active features, or null to
/// indicate all features should be used</param>
/// <returns>Row forward indexer</returns>
public RowForwardIndexer GetFeatureBinRowwiseIndexer(bool[] activeFeatures = null)
{
Contracts.Assert(activeFeatures == null || activeFeatures.Length >= NumFeatures);
var truncatedActiveFeatures = Enumerable.Repeat(true, NumFeatures).ToArray();
if (activeFeatures != null)
Array.Copy(activeFeatures, 0, truncatedActiveFeatures, 0, NumFeatures);
return new RowForwardIndexer(this, truncatedActiveFeatures);
}
public struct DatasetSkeletonQueryDocData
{
public bool IsQueryLevel; // Either query or document level.
public Array Data;
public DatasetSkeletonQueryDocData GetSubset(int[] docArray)
{
DatasetSkeletonQueryDocData qdd = new DatasetSkeletonQueryDocData();
qdd.IsQueryLevel = IsQueryLevel;
Type arrayDataType = Data.GetType().GetElementType();
qdd.Data = Array.CreateInstance(arrayDataType, docArray.Length);
for (int i = 0; i < docArray.Length; ++i)
qdd.Data.SetValue(Data.GetValue(docArray[i]), i);
return qdd;
}
}
/// <summary>
/// A class that contains all of the feature-independent data of the dataset
/// </summary>
public sealed class DatasetSkeleton
{
private readonly short[] _ratings;
public readonly int[] Boundaries;
public readonly ulong[] QueryIds;
public readonly ulong[] DocIds;
public double[][] MaxDcg;
private readonly int[] _docToQuery;
public Dictionary<string, DatasetSkeletonQueryDocData> AuxiliaryData { get; set; }
/// <summary>
/// Initializes a new instance of the <see cref="DatasetSkeleton"/> class.
/// </summary>
/// <param name="ratings"></param>
/// <param name="boundaries">The boundaries.</param>
/// <param name="queryIds">The query ids.</param>
/// <param name="docIds">The doc ids.</param>
/// <param name="actualTargets"></param>
public DatasetSkeleton(short[] ratings, int[] boundaries, ulong[] queryIds, ulong[] docIds, double[] actualTargets = null) :
this(ratings, boundaries, queryIds, docIds, MaxDcgRange(ratings, boundaries, 10), actualTargets)
{ }
/// <summary>
/// Initializes a new instance of the <see cref="DatasetSkeleton"/> class.
/// </summary>
/// <param name="ratings">The ratings.</param>
/// <param name="boundaries">The boundaries.</param>
/// <param name="queryIds">The query ids.</param>
/// <param name="docIds">The doc ids.</param>
/// <param name="maxDcg">The vector of maxDCG.</param>
/// <param name="actualTargets"></param>
public DatasetSkeleton(short[] ratings, int[] boundaries, ulong[] queryIds, ulong[] docIds, double[][] maxDcg, double[] actualTargets = null)
{
AuxiliaryData = new Dictionary<string, DatasetSkeletonQueryDocData>();
_ratings = ratings;
if (actualTargets != null)
ActualTargets = actualTargets;
else
{
ActualTargets = new double[_ratings.Length];
for (int i = 0; i < ActualTargets.Length; i++)
ActualTargets[i] = (double)_ratings[i];
}
Boundaries = boundaries;
QueryIds = queryIds;
DocIds = docIds;
MaxDcg = maxDcg;
// check that the arguments are consistent
CheckConsistency();
// create docToQuery
_docToQuery = new int[docIds.Length];
for (int q = 0; q < queryIds.Length; ++q)
{
for (int d = boundaries[q]; d < boundaries[q + 1]; ++d)
{
_docToQuery[d] = q;
}
}
}
public DatasetSkeleton(byte[] buffer, ref int position)
{
AuxiliaryData = new Dictionary<string, DatasetSkeletonQueryDocData>();
using (Timer.Time(TimerEvent.ConstructFromByteArray))
{
_ratings = buffer.ToShortArray(ref position);
Boundaries = buffer.ToIntArray(ref position);
QueryIds = buffer.ToULongArray(ref position);
DocIds = buffer.ToULongArray(ref position);
MaxDcg = buffer.ToDoubleJaggedArray(ref position);
_docToQuery = buffer.ToIntArray(ref position);
}
}
/// <summary>
/// Checks the consistency of the DatasetSkeleton
/// </summary>
private void CheckConsistency()
{
Contracts.Check(Ratings != null && Boundaries != null && QueryIds != null && DocIds != null && MaxDcg != null,
"DatasetSkeleton is missing a critical field");
Contracts.Check(Ratings.Length == DocIds.Length, "Length of label array does not match length of docID array");
Contracts.Check(Boundaries.Length == QueryIds.Length + 1, "Length of boundaries array does not match length of queryID array");
Contracts.Check(Utils.Size(MaxDcg) == 0 || Utils.Size(MaxDcg[0]) == QueryIds.Length, "Length of MaxDCG does not match number of queries");
}
public double[] ActualTargets
{
get;
private set;
}
public short[] Ratings
{
get { return _ratings; }
}
public int[] DocToQuery
{
get { return _docToQuery; }
}
public int NumDocs
{
get { return DocIds.Length; }
}
public int NumQueries
{
get { return QueryIds.Length; }
}
/// <summary>
/// Returns the number of bytes written by the member ToByteArray()
/// </summary>
public int SizeInBytes()
{
return Ratings.SizeInBytes()
+ Boundaries.SizeInBytes()
+ QueryIds.SizeInBytes()
+ DocIds.SizeInBytes()
+ MaxDcg.SizeInBytes()
+ DocToQuery.SizeInBytes();
}
/// <summary>
/// Writes a binary representation of this class to a byte buffer, at a given position.
/// The position is incremented to the end of the representation
/// </summary>
/// <param name="buffer">a byte array where the binary representation is written</param>
/// <param name="position">the position in the byte array</param>
public void ToByteArray(byte[] buffer, ref int position)
{
Ratings.ToByteArray(buffer, ref position);
Boundaries.ToByteArray(buffer, ref position);
QueryIds.ToByteArray(buffer, ref position);
DocIds.ToByteArray(buffer, ref position);
MaxDcg.ToByteArray(buffer, ref position);
DocToQuery.ToByteArray(buffer, ref position);
}
public byte[] ToByteArray()
{
int position = 0;
byte[] buffer = new byte[SizeInBytes()];
ToByteArray(buffer, ref position);
return buffer;
}
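/// <summary>
/// Partitions the queries into parts whose relative sizes are given by <paramref name="fraction"/>.
/// With a non-negative seed, the queries are randomly permuted and split by count; with a negative
/// seed, each query is assigned by applying a pseudorandom function to its query id, so for a given
/// seed the assignment of a query depends only on its id.
/// </summary>
/// <param name="fraction">An array of the fractional sizes of the parts; must sum to 1.0</param>
/// <param name="randomSeed">Seed of the split; negative values select query-id dependent sampling</param>
/// <param name="assignment">On return, the document indices assigned to each part</param>
/// <returns>The query indices assigned to each part</returns>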
public int[][] GetAssignments(double[] fraction, int randomSeed, out int[][] assignment)
{
// make sure fractions sum to 1.0
if (Math.Abs(fraction.Sum() - 1.0) > 1e-6)
throw Contracts.Except("In Dataset.Split(), fractions must sum to 1.0");
// create a deterministic random number generator
Random rnd = new Random(randomSeed);
// get the number of parts and the number of queries in each part
int numParts = fraction.Length;
int[][] queries = null;
if (randomSeed >= 0)
{
int[] numQueries = fraction.Select(x => (int)(x * NumQueries)).ToArray(numParts);
numQueries[0] += NumQueries - numQueries.Sum();
// get a set of queries in each part
int[] perm = Utils.GetRandomPermutation(rnd, NumQueries);
queries = numQueries.Select(q => new int[q]).ToArray(numParts);
int posInPerm = 0;
for (int p = 0; p < numParts; ++p)
{
// skip empty parts
if (numQueries[p] == 0)
continue;
Array.Copy(perm, posInPerm, queries[p], 0, numQueries[p]);
Array.Sort(queries[p]);
posInPerm += numQueries[p];
}
}
else
{
// With negative random seeds, we do query-id dependent sampling.
PseudorandomFunction func = new PseudorandomFunction(rnd);
int[] thresh = new int[numParts];
int val;
int p;
double cumulative = 0.0;
for (int i = 0; i < numParts; ++i)
{
cumulative += fraction[i];
thresh[i] = (int)(cumulative * int.MaxValue);
if (fraction[i] == 0.0)
thresh[i]--;
}
List<int>[] listQueries = Enumerable.Range(0, numParts).Select(x => new List<int>()).ToArray(numParts);
for (int q = 0; q < NumQueries; ++q)
{
val = func.Apply(QueryIds[q]);
for (p = 0; p < numParts && val > thresh[p]; ++p)
;
listQueries[p].Add(q);
}
queries = listQueries.Select(x => x.ToArray()).ToArray(numParts);
}
// get the set of docs in each part
assignment = Enumerable.Range(0, numParts).Select(
p => queries[p].SelectMany(q => Enumerable.Range(Boundaries[q], Boundaries[q + 1] - Boundaries[q])).ToArray()
).ToArray(numParts);
return queries;
}
public DatasetSkeleton[] Split(double[] fraction, int randomSeed, out int[][] assignment)
{
int[][] queries = GetAssignments(fraction, randomSeed, out assignment);
int numParts = queries.Length;
// get boundaries
int[][] boundaries = queries.Select(q => new int[q.Length + 1]).ToArray(numParts);
for (int p = 0; p < numParts; ++p)
{
boundaries[p][0] = 0;
for (int q = 0; q < queries[p].Length; ++q)
{
boundaries[p][q + 1] = boundaries[p][q] + Boundaries[queries[p][q] + 1] - Boundaries[queries[p][q]];
}
}
// get docIds, queryIds, and labels
short[][] ratings = new short[numParts][];
ulong[][] queryIds = new ulong[numParts][];
ulong[][] docIds = new ulong[numParts][];
for (int p = 0; p < numParts; ++p)
{
ratings[p] = assignment[p].Select(d => Ratings[d]).ToArray();
queryIds[p] = queries[p].Select(q => QueryIds[q]).ToArray();
docIds[p] = assignment[p].Select(d => DocIds[d]).ToArray();
}
// package everything up in datasetSkeleton objects
DatasetSkeleton[] datasetSkeleton = Enumerable.Range(0, numParts).Select(
p => new DatasetSkeleton(ratings[p],
boundaries[p],
queryIds[p],
docIds[p])).ToArray(numParts);
// Do the auxiliary data.
foreach (KeyValuePair<string, DatasetSkeletonQueryDocData> pair in AuxiliaryData)
{
DatasetSkeletonQueryDocData qddata = pair.Value;
Type arrayDataType = qddata.Data.GetType().GetElementType();
for (int p = 0; p < numParts; ++p)
{
int[] mapping = (qddata.IsQueryLevel ? queries : assignment)[p];
Array newData = Array.CreateInstance(arrayDataType, mapping.Length);
for (int i = 0; i < mapping.Length; ++i)
newData.SetValue(qddata.Data.GetValue(mapping[i]), i);
datasetSkeleton[p].SetData(pair.Key, newData, qddata.IsQueryLevel);
}
}
return datasetSkeleton;
}
/// <summary>
/// Takes an array of DatasetSkeleton objects and concatenates them into one big DatasetSkeleton
/// </summary>
/// <param name="parts">An array of DatasetSkeletons</param>
/// <returns>A concatenated DatasetSkeleton</returns>
public static DatasetSkeleton Concat(DatasetSkeleton[] parts)
{
int concatNumDocs = parts.Sum(x => x.NumDocs);
int concatNumQueries = parts.Sum(x => x.NumQueries);
// allocate
short[] concatRatings = new short[concatNumDocs];
ulong[] concatDocIds = new ulong[concatNumDocs];
ulong[] concatQueryIds = new ulong[concatNumQueries];
int[] concatBoundaries = new int[concatNumQueries + 1];
// copy components into new arrays
int docBegin = 0;
int queryBegin = 0;
for (int p = 0; p < parts.Length; ++p)
{
int numDocs = parts[p].NumDocs;
int numQueries = parts[p].NumQueries;
Array.Copy(parts[p].Ratings, 0, concatRatings, docBegin, numDocs);
Array.Copy(parts[p].DocIds, 0, concatDocIds, docBegin, numDocs);
Array.Copy(parts[p].QueryIds, 0, concatQueryIds, queryBegin, numQueries);
for (int q = 0; q < numQueries; ++q)
concatBoundaries[queryBegin + q] = parts[p].Boundaries[q] + docBegin;
docBegin += numDocs;
queryBegin += numQueries;
}
concatBoundaries[queryBegin] = docBegin;
DatasetSkeleton skel = new DatasetSkeleton(concatRatings, concatBoundaries, concatQueryIds, concatDocIds);
SetConcatenatedAuxiliaryData(parts, skel);
return skel;
}
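// _labelMap[r] is the DCG gain assigned to label value r (overridable via LabelGainMap);
// _discountMap[i] caches the natural-log discount 1 / ln(i + 2) applied at rank i + 1.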
private static double[] _labelMap = new double[] { 0.0, 3.0, 7.0, 15.0, 31.0 };
private static readonly double[] _discountMap = new double[] { 1.44269504, 0.91023922, 0.72134752, 0.62133493, 0.55811062, 0.51389834, 0.48089834, 0.45511961, 0.43429448, 0.41703239, 0.40242960 };
public static double[] LabelGainMap
{
get { return _labelMap; }
set { _labelMap = value; }
}
/// <summary>
/// Calculates max DCG, using natural-log discounts, at all truncations from 1 to trunc
/// </summary>
/// <param name="labels">vector of labels</param>
/// <param name="boundaries">vector of query boundaries</param>
/// <param name="trunc">max truncation</param>
private static double[][] MaxDcgRange(short[] labels, int[] boundaries, int trunc)
{
double[][] maxAtN = Enumerable.Range(0, trunc).Select(x => new double[boundaries.Length - 1]).ToArray(trunc);
int relevancyLevel = _labelMap.Length;
int[] labelCounts = new int[relevancyLevel];
for (int q = 0; q < boundaries.Length - 1; ++q)
{
int maxTrunc = Math.Min(trunc, boundaries[q + 1] - boundaries[q]);
if (maxTrunc == 0)
{
for (int t = 0; t < trunc; t++)
maxAtN[t][q] = double.NaN;
continue;
}
Array.Clear(labelCounts, 0, relevancyLevel);
for (int l = boundaries[q]; l < boundaries[q + 1]; l++)
{
short label = labels[l];
labelCounts[label]++;
}
int topLabel = relevancyLevel - 1;
while (labelCounts[topLabel] == 0)
topLabel--;
maxAtN[0][q] = _labelMap[topLabel] * _discountMap[0];
labelCounts[topLabel]--;
for (int t = 1; t < maxTrunc; t++)
{
while (labelCounts[topLabel] == 0)
topLabel--;
maxAtN[t][q] = maxAtN[t - 1][q] + _labelMap[topLabel] * _discountMap[t];
labelCounts[topLabel]--;
}
for (int t = maxTrunc; t < trunc; t++)
{
maxAtN[t][q] = maxAtN[t - 1][q];
}
}
return maxAtN;
}
public void RecomputeMaxDcg(int truncationLevel)
{
MaxDcg = null;
MaxDcg = MaxDcgRange(Ratings, Boundaries, truncationLevel);
}
/// <summary>
/// Given the auxiliary data in a bunch of parts, set the concatenated dataset appropriately.
/// </summary>
/// <param name="parts">The individual parts of the dataset</param>
/// <param name="concat">The concatenated version of this dataset</param>
private static void SetConcatenatedAuxiliaryData(DatasetSkeleton[] parts, DatasetSkeleton concat)
{
// Get the union of all the auxiliary data names.
Dictionary<string, bool> auxNames = new Dictionary<string, bool>();
foreach (DatasetSkeleton part in parts)
{
foreach (string name in part.AuxiliaryData.Keys)
{
auxNames[name] = true;
}
}
DatasetSkeletonQueryDocData[] partsDatas = new DatasetSkeletonQueryDocData[parts.Length];
int[] docLengths = parts.Select(x => x.NumDocs).ToArray();
int[] queryLengths = parts.Select(x => x.NumQueries).ToArray();
foreach (string name in auxNames.Keys)
{
for (int p = 0; p < parts.Length; ++p)
{
partsDatas[p] = parts[p].AuxiliaryData.ContainsKey(name) ? parts[p].AuxiliaryData[name] : default(DatasetSkeletonQueryDocData);
}
bool isQuery = partsDatas.First(pd => pd.Data != null).IsQueryLevel;
if (partsDatas.Any(pd => pd.Data != null && pd.IsQueryLevel != isQuery))
{
throw Contracts.Except("On auxiliary data {0}, disagreement on whether this is query/doc", name);
}
Array concatArray = ConcatArrays(partsDatas.Select(pd => pd.Data).ToArray(), isQuery ? queryLengths : docLengths, name);
concat.SetData(name, concatArray, isQuery);
}
}
private static Array ConcatArrays(Array[] arrays, int[] lengths, string name)
{
// If all arrays are null (or there are no arrays), then the concat vector is null.
if (arrays.All(x => x == null))
return null;
// What is the total length?
int newLength = lengths.Sum();
// What is the type of these?
Type t = arrays.First(x => x != null).GetType().GetElementType();
if (arrays.Any(x => x != null && t != x.GetType().GetElementType()))
{
IEnumerable<string> typeNameEnumerable = arrays.Where(x => x != null).Select(x => x.GetType().GetElementType()).Distinct().Select(x => x.Name).OrderBy(n => n);
throw Contracts.Except("When combining auxiliary data, the types of elements must match. Distinct types {0} detected for data named {1}",
String.Join(", ", typeNameEnumerable), name);
}
Array a = Array.CreateInstance(t, newLength);
int start = 0;
for (int i = 0; i < lengths.Length; ++i)
{
if (arrays[i] != null)
Array.Copy(arrays[i], 0, a, start, lengths[i]);
start += lengths[i];
}
return a;
}
/// <summary>
/// Sets some named query or document level auxiliary data.
/// </summary>
/// <param name="name">The name of the parameter</param>
/// <param name="array"></param>
/// <param name="queryLevel"></param>
public void SetData(string name, Array array, bool queryLevel)
{
int shouldHaveLength = queryLevel ? NumQueries : NumDocs;
if (array.Length != shouldHaveLength)
{
throw Contracts.Except(
"Input array for {0} had {1} elements, ought to have {2}",
name, array.Length, shouldHaveLength);
}
DatasetSkeletonQueryDocData dd;
dd.Data = array;
dd.IsQueryLevel = queryLevel;
AuxiliaryData[name] = dd;
}
/// <summary>
/// Retrieves some auxiliary data previously set to this skeleton.
/// </summary>
/// <typeparam name="T">The type of the array, which should match the type passed in</typeparam>
public T[] GetData<T>(string name)
{
if (!AuxiliaryData.ContainsKey(name))
return null;
return (T[])AuxiliaryData[name].Data;
}
private static string SampleWeightsSetName { get { return "SampleWeights"; } }
public double[] SampleWeights
{
get { return GetData<double>(SampleWeightsSetName); }
set
{
if (value == null)
{
if (AuxiliaryData.ContainsKey(SampleWeightsSetName))
{
AuxiliaryData.Remove(SampleWeightsSetName);
}
return;
}
SetData(SampleWeightsSetName, value, false);
}
}
}
/// <summary>
/// Allows forward indexing by row across multiple features in the dataset.
/// </summary>
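/// <remarks>Typical use: obtain an indexer from <see cref="Dataset.GetFeatureBinRowwiseIndexer"/>, then
/// indexer[doc][feature] yields the bin value of that feature for the given document.</remarks>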
public sealed class RowForwardIndexer
{
private readonly Dataset _dataset;
private readonly FeatureFlockBase.FlockForwardIndexerBase[] _flockIndexers;
public readonly struct Row
{
private readonly RowForwardIndexer _indexer;
private readonly int _rowIndex;
/// <summary>
/// Indexes the value of a feature for this row.
/// </summary>
/// <param name="featureIndex">The feature index</param>
/// <returns>The binned value of the feature for this row</returns>
public int this[int featureIndex]
{
get
{
int flock;
int subfeature;
_indexer._dataset.MapFeatureToFlockAndSubFeature(featureIndex, out flock, out subfeature);
Contracts.AssertValue(_indexer._flockIndexers[flock]);
return _indexer._flockIndexers[flock][subfeature, _rowIndex];
}
}
public Row(RowForwardIndexer indexer, int rowIndex)
{
Contracts.AssertValue(indexer);
Contracts.Assert(0 <= rowIndex && rowIndex < indexer._dataset.NumDocs);
_indexer = indexer;
_rowIndex = rowIndex;
}
}
/// <summary>
/// Constructor.
/// </summary>
/// <param name="dataset">The dataset to create the indexer over</param>
/// <param name="active">Either null to indicate all columns should be active, or
/// a boolean array of length equal to the number of features that should be active</param>
public RowForwardIndexer(Dataset dataset, bool[] active = null)
{
Contracts.AssertValue(dataset);
Contracts.Assert(active == null || active.Length == dataset.NumFeatures);
_dataset = dataset;
if (active == null)
_flockIndexers = _dataset._flocks.Select(d => d.GetFlockIndexer()).ToArray(_dataset.NumFlocks);
else
{
// We have an actives array.
_flockIndexers = new FeatureFlockBase.FlockForwardIndexerBase[_dataset.NumFlocks];
for (int iflock = 0; iflock < _dataset.NumFlocks; ++iflock)
{
var flock = _dataset._flocks[iflock];
int offset = _dataset._flockToFirstFeature[iflock];
for (int i = 0; i < flock.Count; ++i)
{
if (active[i + offset])
{
_flockIndexers[iflock] = flock.GetFlockIndexer();
break;
}
}
}
// This assert uses a slower but more intuitive test to verify the correctness of the above code.
Contracts.Assert(Enumerable.Range(0, _dataset.NumFlocks).All(f =>
Enumerable.Range(_dataset._flockToFirstFeature[f], _dataset._flocks[f].Count).Any(i => active[i]) ==
(_flockIndexers[f] != null)));
}
}
public Row this[int row] { get { return new Row(this, row); } }
}
}
}