Model\TiktokenTokenizer.cs (69)
1030private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixToEncoding =
1033( "o1-", ModelEncoding.O200kBase ), // e.g. o1-mini
1034( "o3-", ModelEncoding.O200kBase ), // e.g. o3-mini
1035( "gpt-4o-", ModelEncoding.O200kBase), // e.g., gpt-4o-2024-05-13
1036( "gpt-4-", ModelEncoding.Cl100kBase), // e.g., gpt-4-0314, etc., plus gpt-4-32k
1037( "gpt-3.5-", ModelEncoding.Cl100kBase), // e.g, gpt-3.5-turbo-0301, -0401, etc.
1038( "gpt-35-", ModelEncoding.Cl100kBase ) // Azure deployment name
1041private static readonly Dictionary<string, ModelEncoding> _modelToEncoding =
1042new Dictionary<string, ModelEncoding>(StringComparer.OrdinalIgnoreCase)
1045{ "gpt-4o", ModelEncoding.O200kBase },
1046{ "o1", ModelEncoding.O200kBase },
1047{ "o3", ModelEncoding.O200kBase },
1048{ "gpt-4", ModelEncoding.Cl100kBase },
1049{ "gpt-3.5-turbo", ModelEncoding.Cl100kBase },
1050{ "gpt-3.5-turbo-16k", ModelEncoding.Cl100kBase },
1051{ "gpt-35", ModelEncoding.Cl100kBase }, // Azure deployment name
1052{ "gpt-35-turbo", ModelEncoding.Cl100kBase }, // Azure deployment name
1053{ "gpt-35-turbo-16k", ModelEncoding.Cl100kBase }, // Azure deployment name
1056{ "text-davinci-003", ModelEncoding.P50kBase },
1057{ "text-davinci-002", ModelEncoding.P50kBase },
1058{ "text-davinci-001", ModelEncoding.R50kBase },
1059{ "text-curie-001", ModelEncoding.R50kBase },
1060{ "text-babbage-001", ModelEncoding.R50kBase },
1061{ "text-ada-001", ModelEncoding.R50kBase },
1062{ "davinci", ModelEncoding.R50kBase },
1063{ "curie", ModelEncoding.R50kBase },
1064{ "babbage", ModelEncoding.R50kBase },
1065{ "ada", ModelEncoding.R50kBase },
1068{ "code-davinci-002", ModelEncoding.P50kBase },
1069{ "code-davinci-001", ModelEncoding.P50kBase },
1070{ "code-cushman-002", ModelEncoding.P50kBase },
1071{ "code-cushman-001", ModelEncoding.P50kBase },
1072{ "davinci-codex", ModelEncoding.P50kBase },
1073{ "cushman-codex", ModelEncoding.P50kBase },
1076{ "text-davinci-edit-001", ModelEncoding.P50kEdit },
1077{ "code-davinci-edit-001", ModelEncoding.P50kEdit },
1081{ "text-embedding-ada-002", ModelEncoding.Cl100kBase },
1082{ "text-embedding-3-small", ModelEncoding.Cl100kBase },
1083{ "text-embedding-3-large", ModelEncoding.Cl100kBase },
1086{ "text-similarity-davinci-001", ModelEncoding.R50kBase },
1087{ "text-similarity-curie-001", ModelEncoding.R50kBase },
1088{ "text-similarity-babbage-001", ModelEncoding.R50kBase },
1089{ "text-similarity-ada-001", ModelEncoding.R50kBase },
1090{ "text-search-davinci-doc-001", ModelEncoding.R50kBase },
1091{ "text-search-curie-doc-001", ModelEncoding.R50kBase },
1092{ "text-search-babbage-doc-001", ModelEncoding.R50kBase },
1093{ "text-search-ada-doc-001", ModelEncoding.R50kBase },
1094{ "code-search-babbage-code-001", ModelEncoding.R50kBase },
1095{ "code-search-ada-code-001", ModelEncoding.R50kBase },
1098{ "gpt2", ModelEncoding.GPT2 },
1101{ Phi4ModelName, ModelEncoding.Cl100kBase },
1104private static ModelEncoding GetModelEncoding(string modelName)
1106if (!_modelToEncoding.TryGetValue(modelName, out ModelEncoding encoder))
1108foreach ((string Prefix, ModelEncoding Encoding) in _modelPrefixToEncoding)
1118if (encoder == ModelEncoding.None)
1128private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null)
1132case ModelEncoding.Cl100kBase:
1141case ModelEncoding.GPT2:
1144case ModelEncoding.O200kBase:
1147case ModelEncoding.P50kBase:
1150case ModelEncoding.P50kEdit:
1154case ModelEncoding.R50kBase:
1219ModelEncoding modelEncoding,
1465ModelEncoding modelEncoding;
1468modelEncoding = ModelEncoding.Cl100kBase;
1472modelEncoding = ModelEncoding.O200kBase;
1476modelEncoding = ModelEncoding.P50kBase;
1480modelEncoding = ModelEncoding.P50kEdit;
1484modelEncoding = ModelEncoding.R50kBase;