# cbmc/specgen/util/token_counter.py

try:
    import tiktoken
    TIKTOKEN_AVAILABLE = True
except ImportError:
    TIKTOKEN_AVAILABLE = False

def count_str_token(string: str) -> int:
    """Return the number of tokens in ``string``.

    When ``tiktoken`` was imported successfully, the text is encoded with the
    encoding registered for ``gpt-3.5-turbo`` and the number of resulting
    tokens is returned. Otherwise the count is estimated as one token per
    four characters.

    Args:
        string: Text to measure.

    Returns:
        The tiktoken token count, or the ``len(string) // 4`` estimate when
        ``tiktoken`` is unavailable.
    """
    if not TIKTOKEN_AVAILABLE:
        # Fallback heuristic: approximately 4 characters per token for English text
        return len(string) // 4

    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    # By default ``encode`` raises ValueError when the text contains a
    # special-token literal such as "<|endoftext|>". A counter must accept
    # arbitrary text, so such literals are encoded as ordinary text instead.
    return len(encoding.encode(string, disallowed_special=()))

def count_deepseek_tokens(text: str) -> int:
    """Approximate the DeepSeek token count of ``text``.

    No tokenizer is involved: the estimate is one token per four characters,
    rounded down.
    """
    chars_per_token = 4
    estimate, _remainder = divmod(len(text), chars_per_token)
    return estimate

def count_config_token(config) -> int:
    """Return the total token count of all messages in a chat config.

    Configs whose ``model`` value starts with ``"deepseek"`` are measured with
    ``count_deepseek_tokens``; every other config is measured with
    ``count_str_token``.

    Args:
        config: Mapping with a ``'messages'`` sequence of mappings that each
            carry a ``'content'`` entry, and an optional ``'model'`` string.

    Returns:
        The sum of the per-message token counts.

    Raises:
        KeyError: If ``'messages'`` is missing from ``config`` or a message
            has no ``'content'`` key.
    """
    # ``model`` may be absent or explicitly None; both count as non-DeepSeek.
    model = config.get('model') or ''
    if model.startswith('deepseek'):
        counter = count_deepseek_tokens
    else:
        counter = count_str_token

    total = 0
    for message in config['messages']:
        content = message['content']
        if content is None:
            # A message without text (content set to None) contributes no tokens.
            continue
        total += counter(content)
    return total