You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
cbmc/specgen/util/token_counter.py

31 lines
1.1 KiB

try:
import tiktoken
TIKTOKEN_AVAILABLE = True
except ImportError:
TIKTOKEN_AVAILABLE = False
def count_str_token(string: str) -> int:
    """Return the number of tokens in a text string.

    When ``tiktoken`` is importable (``TIKTOKEN_AVAILABLE`` is true) the text
    is tokenized with the encoding registered for ``gpt-3.5-turbo``.
    Otherwise the count is estimated as one token per four characters.

    Args:
        string: The text to measure.

    Returns:
        The tokenizer's token count, or the ``len(string) // 4`` estimate
        when ``tiktoken`` is not available.
    """
    if not TIKTOKEN_AVAILABLE:
        # Fallback heuristic: approximately 4 characters per token for English text
        return len(string) // 4

    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    # By default ``encode`` raises ValueError when the text contains a
    # special-token marker such as "<|endoftext|>". This function only
    # counts, so such markers are tokenized as ordinary text instead.
    return len(encoding.encode(string, disallowed_special=()))
def count_deepseek_tokens(text: str) -> int:
    """Approximate the token count of text destined for the DeepSeek API.

    No tokenizer is involved: the estimate is one token per four
    characters, rounded down.

    Args:
        text: The text to measure.

    Returns:
        ``len(text)`` floor-divided by four.
    """
    chars_per_token = 4  # heuristic ratio, not a real tokenizer
    char_count = len(text)
    return char_count // chars_per_token
def count_config_token(config) -> int:
    """Return the total token count of all messages in a request config.

    Each message's ``'content'`` is measured with ``count_deepseek_tokens``
    when the config's ``'model'`` value starts with ``'deepseek'``, and with
    ``count_str_token`` otherwise.

    Args:
        config: Mapping with a ``'messages'`` iterable of mappings, each
            holding a ``'content'`` entry, and an optional ``'model'`` name.

    Returns:
        The sum of the per-message token counts (0 for no messages).

    Raises:
        KeyError: If ``'messages'`` is missing from ``config`` or a message
            has no ``'content'`` key.
    """
    # A missing or ``None`` model name is treated as "not DeepSeek".
    model_name = config.get('model') or ''
    if model_name.startswith('deepseek'):
        count_tokens = count_deepseek_tokens
    else:
        count_tokens = count_str_token

    # ``or ''`` makes a ``None`` content count as zero tokens instead of
    # raising TypeError inside the counter.
    return sum(
        count_tokens(message['content'] or '')
        for message in config['messages']
    )