95 references to ModelEncoding
Microsoft.ML.Tokenizers (95)
Model\TiktokenTokenizer.cs (95)
1040private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixToEncoding = 1042( "o1-", ModelEncoding.O200kBase ), // e.g. o1-mini 1043( "o3-", ModelEncoding.O200kBase ), // e.g. o3-mini 1044( "o4-mini-", ModelEncoding.O200kBase ), // e.g. o4-mini 1047( "gpt-5.3-", ModelEncoding.O200kBase ), 1048( "gpt-5.2-", ModelEncoding.O200kBase ), 1049( "gpt-5.1-", ModelEncoding.O200kBase ), 1050( "gpt-5-", ModelEncoding.O200kBase ), 1051( "gpt-4.1-", ModelEncoding.O200kBase ), // e.g., gpt-4.1-mini 1052( "gpt-4.5-", ModelEncoding.O200kBase ), // e.g., gpt-4.5 1053( "gpt-4o-", ModelEncoding.O200kBase ), // e.g., gpt-4o-2024-05-13 1054( "chatgpt-4o-", ModelEncoding.O200kBase ), 1055( "gpt-4-", ModelEncoding.Cl100kBase ), // e.g., gpt-4-0314, etc., plus gpt-4-32k 1056( "gpt-3.5-", ModelEncoding.Cl100kBase ), // e.g, gpt-3.5-turbo-0301, -0401, etc. 1057( "gpt-35-", ModelEncoding.Cl100kBase ), // Azure deployment name 1058( "gpt-oss-", ModelEncoding.O200kHarmony ), 1061( "ft:gpt-4o", ModelEncoding.O200kBase ), 1062( "ft:gpt-4", ModelEncoding.Cl100kBase ), 1063( "ft:gpt-3.5-turbo", ModelEncoding.Cl100kBase ), 1064( "ft:davinci-002", ModelEncoding.Cl100kBase ), 1065( "ft:babbage-002", ModelEncoding.Cl100kBase ), 1068private static readonly Dictionary<string, ModelEncoding> _modelToEncoding = 1069new Dictionary<string, ModelEncoding>(StringComparer.OrdinalIgnoreCase) 1072{ "o1", ModelEncoding.O200kBase }, 1073{ "o3", ModelEncoding.O200kBase }, 1074{ "o4-mini", ModelEncoding.O200kBase }, 1077{ "gpt-5.3", ModelEncoding.O200kBase }, 1078{ "gpt-5.2", ModelEncoding.O200kBase }, 1079{ "gpt-5.1", ModelEncoding.O200kBase }, 1080{ "gpt-5", ModelEncoding.O200kBase }, 1081{ "gpt-4.1", ModelEncoding.O200kBase }, 1082{ "gpt-4o", ModelEncoding.O200kBase }, 1083{ "gpt-4", ModelEncoding.Cl100kBase }, 1084{ "gpt-3.5-turbo", ModelEncoding.Cl100kBase }, 1085{ "gpt-3.5", ModelEncoding.Cl100kBase }, 1086{ "gpt-3.5-turbo-16k", ModelEncoding.Cl100kBase }, 1087{ "gpt-35", ModelEncoding.Cl100kBase }, // Azure deployment name 1088{ "gpt-35-turbo", ModelEncoding.Cl100kBase }, // Azure deployment name 1089{ "gpt-35-turbo-16k", ModelEncoding.Cl100kBase }, // Azure deployment name 1092{ "davinci-002", ModelEncoding.Cl100kBase }, 1093{ "babbage-002", ModelEncoding.Cl100kBase }, 1097{ "text-embedding-ada-002", ModelEncoding.Cl100kBase }, 1098{ "text-embedding-3-small", ModelEncoding.Cl100kBase }, 1099{ "text-embedding-3-large", ModelEncoding.Cl100kBase }, 1103{ "text-davinci-003", ModelEncoding.P50kBase }, 1104{ "text-davinci-002", ModelEncoding.P50kBase }, 1105{ "text-davinci-001", ModelEncoding.R50kBase }, 1106{ "text-curie-001", ModelEncoding.R50kBase }, 1107{ "text-babbage-001", ModelEncoding.R50kBase }, 1108{ "text-ada-001", ModelEncoding.R50kBase }, 1109{ "davinci", ModelEncoding.R50kBase }, 1110{ "curie", ModelEncoding.R50kBase }, 1111{ "babbage", ModelEncoding.R50kBase }, 1112{ "ada", ModelEncoding.R50kBase }, 1115{ "code-davinci-002", ModelEncoding.P50kBase }, 1116{ "code-davinci-001", ModelEncoding.P50kBase }, 1117{ "code-cushman-002", ModelEncoding.P50kBase }, 1118{ "code-cushman-001", ModelEncoding.P50kBase }, 1119{ "davinci-codex", ModelEncoding.P50kBase }, 1120{ "cushman-codex", ModelEncoding.P50kBase }, 1123{ "text-davinci-edit-001", ModelEncoding.P50kEdit }, 1124{ "code-davinci-edit-001", ModelEncoding.P50kEdit }, 1128{ "text-similarity-davinci-001", ModelEncoding.R50kBase }, 1129{ "text-similarity-curie-001", ModelEncoding.R50kBase }, 1130{ "text-similarity-babbage-001", ModelEncoding.R50kBase }, 1131{ "text-similarity-ada-001", ModelEncoding.R50kBase }, 1132{ "text-search-davinci-doc-001", ModelEncoding.R50kBase }, 1133{ "text-search-curie-doc-001", ModelEncoding.R50kBase }, 1134{ "text-search-babbage-doc-001", ModelEncoding.R50kBase }, 1135{ "text-search-ada-doc-001", ModelEncoding.R50kBase }, 1136{ "code-search-babbage-code-001", ModelEncoding.R50kBase }, 1137{ "code-search-ada-code-001", ModelEncoding.R50kBase }, 1140{ "gpt2", ModelEncoding.GPT2 }, 1141{ "gpt-2", ModelEncoding.GPT2 }, 1144{ Phi4ModelName, ModelEncoding.Cl100kBase }, 1147private static ModelEncoding GetModelEncoding(string modelName) 1149if (!_modelToEncoding.TryGetValue(modelName, out ModelEncoding encoder)) 1151foreach ((string Prefix, ModelEncoding Encoding) in _modelPrefixToEncoding) 1161if (encoder == ModelEncoding.None) 1197private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null) 1201case ModelEncoding.Cl100kBase: 1210case ModelEncoding.GPT2: 1213case ModelEncoding.O200kBase: 1216case ModelEncoding.P50kBase: 1219case ModelEncoding.P50kEdit: 1223case ModelEncoding.R50kBase: 1226case ModelEncoding.O200kHarmony: 1292ModelEncoding modelEncoding, 1538ModelEncoding modelEncoding; 1541modelEncoding = ModelEncoding.Cl100kBase; 1545modelEncoding = ModelEncoding.O200kBase; 1549modelEncoding = ModelEncoding.O200kHarmony; 1553modelEncoding = ModelEncoding.P50kBase; 1557modelEncoding = ModelEncoding.P50kEdit; 1561modelEncoding = ModelEncoding.R50kBase;