72 references to ModelEncoding
Microsoft.ML.Tokenizers (72)
Model\TiktokenTokenizer.cs (72)
1030private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixToEncoding = 1033( "o1-", ModelEncoding.O200kBase ), // e.g. o1-mini 1034( "o3-", ModelEncoding.O200kBase ), // e.g. o3-mini 1035( "gpt-4.1-", ModelEncoding.O200kBase), // e.g., gpt-4.1-mini 1036( "gpt-4o-", ModelEncoding.O200kBase), // e.g., gpt-4o-2024-05-13 1037( "gpt-4-", ModelEncoding.Cl100kBase), // e.g., gpt-4-0314, etc., plus gpt-4-32k 1038( "gpt-3.5-", ModelEncoding.Cl100kBase), // e.g, gpt-3.5-turbo-0301, -0401, etc. 1039( "gpt-35-", ModelEncoding.Cl100kBase ) // Azure deployment name 1042private static readonly Dictionary<string, ModelEncoding> _modelToEncoding = 1043new Dictionary<string, ModelEncoding>(StringComparer.OrdinalIgnoreCase) 1046{ "gpt-4o", ModelEncoding.O200kBase }, 1047{ "o1", ModelEncoding.O200kBase }, 1048{ "o3", ModelEncoding.O200kBase }, 1049{ "o4-mini", ModelEncoding.O200kBase }, 1050{ "gpt-4.1", ModelEncoding.O200kBase }, 1051{ "gpt-4", ModelEncoding.Cl100kBase }, 1052{ "gpt-3.5-turbo", ModelEncoding.Cl100kBase }, 1053{ "gpt-3.5-turbo-16k", ModelEncoding.Cl100kBase }, 1054{ "gpt-35", ModelEncoding.Cl100kBase }, // Azure deployment name 1055{ "gpt-35-turbo", ModelEncoding.Cl100kBase }, // Azure deployment name 1056{ "gpt-35-turbo-16k", ModelEncoding.Cl100kBase }, // Azure deployment name 1059{ "text-davinci-003", ModelEncoding.P50kBase }, 1060{ "text-davinci-002", ModelEncoding.P50kBase }, 1061{ "text-davinci-001", ModelEncoding.R50kBase }, 1062{ "text-curie-001", ModelEncoding.R50kBase }, 1063{ "text-babbage-001", ModelEncoding.R50kBase }, 1064{ "text-ada-001", ModelEncoding.R50kBase }, 1065{ "davinci", ModelEncoding.R50kBase }, 1066{ "curie", ModelEncoding.R50kBase }, 1067{ "babbage", ModelEncoding.R50kBase }, 1068{ "ada", ModelEncoding.R50kBase }, 1071{ "code-davinci-002", ModelEncoding.P50kBase }, 1072{ "code-davinci-001", ModelEncoding.P50kBase }, 1073{ "code-cushman-002", ModelEncoding.P50kBase }, 1074{ "code-cushman-001", ModelEncoding.P50kBase }, 1075{ "davinci-codex", ModelEncoding.P50kBase }, 1076{ "cushman-codex", ModelEncoding.P50kBase }, 1079{ "text-davinci-edit-001", ModelEncoding.P50kEdit }, 1080{ "code-davinci-edit-001", ModelEncoding.P50kEdit }, 1084{ "text-embedding-ada-002", ModelEncoding.Cl100kBase }, 1085{ "text-embedding-3-small", ModelEncoding.Cl100kBase }, 1086{ "text-embedding-3-large", ModelEncoding.Cl100kBase }, 1089{ "text-similarity-davinci-001", ModelEncoding.R50kBase }, 1090{ "text-similarity-curie-001", ModelEncoding.R50kBase }, 1091{ "text-similarity-babbage-001", ModelEncoding.R50kBase }, 1092{ "text-similarity-ada-001", ModelEncoding.R50kBase }, 1093{ "text-search-davinci-doc-001", ModelEncoding.R50kBase }, 1094{ "text-search-curie-doc-001", ModelEncoding.R50kBase }, 1095{ "text-search-babbage-doc-001", ModelEncoding.R50kBase }, 1096{ "text-search-ada-doc-001", ModelEncoding.R50kBase }, 1097{ "code-search-babbage-code-001", ModelEncoding.R50kBase }, 1098{ "code-search-ada-code-001", ModelEncoding.R50kBase }, 1101{ "gpt2", ModelEncoding.GPT2 }, 1104{ Phi4ModelName, ModelEncoding.Cl100kBase }, 1107private static ModelEncoding GetModelEncoding(string modelName) 1109if (!_modelToEncoding.TryGetValue(modelName, out ModelEncoding encoder)) 1111foreach ((string Prefix, ModelEncoding Encoding) in _modelPrefixToEncoding) 1121if (encoder == ModelEncoding.None) 1131private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null) 1135case ModelEncoding.Cl100kBase: 1144case ModelEncoding.GPT2: 1147case ModelEncoding.O200kBase: 1150case ModelEncoding.P50kBase: 1153case ModelEncoding.P50kEdit: 1157case ModelEncoding.R50kBase: 1222ModelEncoding modelEncoding, 1468ModelEncoding modelEncoding; 1471modelEncoding = ModelEncoding.Cl100kBase; 1475modelEncoding = ModelEncoding.O200kBase; 1479modelEncoding = ModelEncoding.P50kBase; 1483modelEncoding = ModelEncoding.P50kEdit; 1487modelEncoding = ModelEncoding.R50kBase;