Model\TiktokenTokenizer.cs (89)
1040private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixToEncoding =
1042( "o1-", ModelEncoding.O200kBase ), // e.g. o1-mini
1043( "o3-", ModelEncoding.O200kBase ), // e.g. o3-mini
1044( "o4-mini-", ModelEncoding.O200kBase ), // e.g. o4-mini
1047( "gpt-5-", ModelEncoding.O200kBase),
1048( "gpt-4.1-", ModelEncoding.O200kBase), // e.g., gpt-4.1-mini
1049( "gpt-4.5-", ModelEncoding.O200kBase), // e.g., gpt-4.5
1050( "gpt-4o-", ModelEncoding.O200kBase), // e.g., gpt-4o-2024-05-13
1051( "chatgpt-4o-", ModelEncoding.O200kBase),
1052( "gpt-4-", ModelEncoding.Cl100kBase), // e.g., gpt-4-0314, etc., plus gpt-4-32k
1053( "gpt-3.5-", ModelEncoding.Cl100kBase), // e.g, gpt-3.5-turbo-0301, -0401, etc.
1054( "gpt-35-", ModelEncoding.Cl100kBase ), // Azure deployment name
1055( "gpt-oss-", ModelEncoding.O200kHarmony ),
1058( "ft:gpt-4o", ModelEncoding.O200kBase ),
1059( "ft:gpt-4", ModelEncoding.Cl100kBase ),
1060( "ft:gpt-3.5-turbo", ModelEncoding.Cl100kBase ),
1061( "ft:davinci-002", ModelEncoding.Cl100kBase ),
1062( "ft:babbage-002", ModelEncoding.Cl100kBase ),
1065private static readonly Dictionary<string, ModelEncoding> _modelToEncoding =
1066new Dictionary<string, ModelEncoding>(StringComparer.OrdinalIgnoreCase)
1069{ "o1", ModelEncoding.O200kBase },
1070{ "o3", ModelEncoding.O200kBase },
1071{ "o4-mini", ModelEncoding.O200kBase },
1074{ "gpt-5", ModelEncoding.O200kBase },
1075{ "gpt-4.1", ModelEncoding.O200kBase },
1076{ "gpt-4o", ModelEncoding.O200kBase },
1077{ "gpt-4", ModelEncoding.Cl100kBase },
1078{ "gpt-3.5-turbo", ModelEncoding.Cl100kBase },
1079{ "gpt-3.5", ModelEncoding.Cl100kBase },
1080{ "gpt-3.5-turbo-16k", ModelEncoding.Cl100kBase },
1081{ "gpt-35", ModelEncoding.Cl100kBase }, // Azure deployment name
1082{ "gpt-35-turbo", ModelEncoding.Cl100kBase }, // Azure deployment name
1083{ "gpt-35-turbo-16k", ModelEncoding.Cl100kBase }, // Azure deployment name
1086{ "davinci-002", ModelEncoding.Cl100kBase },
1087{ "babbage-002", ModelEncoding.Cl100kBase },
1091{ "text-embedding-ada-002", ModelEncoding.Cl100kBase },
1092{ "text-embedding-3-small", ModelEncoding.Cl100kBase },
1093{ "text-embedding-3-large", ModelEncoding.Cl100kBase },
1097{ "text-davinci-003", ModelEncoding.P50kBase },
1098{ "text-davinci-002", ModelEncoding.P50kBase },
1099{ "text-davinci-001", ModelEncoding.R50kBase },
1100{ "text-curie-001", ModelEncoding.R50kBase },
1101{ "text-babbage-001", ModelEncoding.R50kBase },
1102{ "text-ada-001", ModelEncoding.R50kBase },
1103{ "davinci", ModelEncoding.R50kBase },
1104{ "curie", ModelEncoding.R50kBase },
1105{ "babbage", ModelEncoding.R50kBase },
1106{ "ada", ModelEncoding.R50kBase },
1109{ "code-davinci-002", ModelEncoding.P50kBase },
1110{ "code-davinci-001", ModelEncoding.P50kBase },
1111{ "code-cushman-002", ModelEncoding.P50kBase },
1112{ "code-cushman-001", ModelEncoding.P50kBase },
1113{ "davinci-codex", ModelEncoding.P50kBase },
1114{ "cushman-codex", ModelEncoding.P50kBase },
1117{ "text-davinci-edit-001", ModelEncoding.P50kEdit },
1118{ "code-davinci-edit-001", ModelEncoding.P50kEdit },
1122{ "text-similarity-davinci-001", ModelEncoding.R50kBase },
1123{ "text-similarity-curie-001", ModelEncoding.R50kBase },
1124{ "text-similarity-babbage-001", ModelEncoding.R50kBase },
1125{ "text-similarity-ada-001", ModelEncoding.R50kBase },
1126{ "text-search-davinci-doc-001", ModelEncoding.R50kBase },
1127{ "text-search-curie-doc-001", ModelEncoding.R50kBase },
1128{ "text-search-babbage-doc-001", ModelEncoding.R50kBase },
1129{ "text-search-ada-doc-001", ModelEncoding.R50kBase },
1130{ "code-search-babbage-code-001", ModelEncoding.R50kBase },
1131{ "code-search-ada-code-001", ModelEncoding.R50kBase },
1134{ "gpt2", ModelEncoding.GPT2 },
1135{ "gpt-2", ModelEncoding.GPT2 },
1138{ Phi4ModelName, ModelEncoding.Cl100kBase },
1141private static ModelEncoding GetModelEncoding(string modelName)
1143if (!_modelToEncoding.TryGetValue(modelName, out ModelEncoding encoder))
1145foreach ((string Prefix, ModelEncoding Encoding) in _modelPrefixToEncoding)
1155if (encoder == ModelEncoding.None)
1191private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null)
1195case ModelEncoding.Cl100kBase:
1204case ModelEncoding.GPT2:
1207case ModelEncoding.O200kBase:
1210case ModelEncoding.P50kBase:
1213case ModelEncoding.P50kEdit:
1217case ModelEncoding.R50kBase:
1220case ModelEncoding.O200kHarmony:
1286ModelEncoding modelEncoding,
1532ModelEncoding modelEncoding;
1535modelEncoding = ModelEncoding.Cl100kBase;
1539modelEncoding = ModelEncoding.O200kBase;
1543modelEncoding = ModelEncoding.O200kHarmony;
1547modelEncoding = ModelEncoding.P50kBase;
1551modelEncoding = ModelEncoding.P50kEdit;
1555modelEncoding = ModelEncoding.R50kBase;