Source code for lexilux.chat.history

"""
Chat history management.

Provides ChatHistory class for managing conversation history with automatic extraction,
serialization, token counting, and truncation capabilities.
"""

from __future__ import annotations

import json
from collections.abc import MutableSequence, Sequence
from copy import deepcopy
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

from lexilux.chat.models import ChatResult, MessagesLike
from lexilux.chat.utils import normalize_messages

if TYPE_CHECKING:
    from lexilux.tokenizer import Tokenizer


@dataclass
class TokenAnalysis:
    """
    Token statistics for a conversation history.

    Holds the aggregate totals, the per-role split, one entry per message and
    one entry per round, plus a few summary metrics.

    Attributes:
        total_tokens: Total number of tokens across all messages.
        system_tokens: Number of tokens in the system message (if present).
        user_tokens: Total tokens in all user messages.
        assistant_tokens: Total tokens in all assistant messages.
        total_messages: Total number of messages analyzed.
        system_messages: Number of system messages.
        user_messages: Number of user messages.
        assistant_messages: Number of assistant messages.
        per_message: List of (role, content_preview, tokens) tuples.
        per_round: List of (round_index, round_tokens, user_tokens,
            assistant_tokens) tuples.
        average_tokens_per_message: Average tokens per message.
        average_tokens_per_round: Average tokens per round.
        max_message_tokens: Maximum tokens in a single message.
        min_message_tokens: Minimum tokens in a single message.
        token_distribution: Dict mapping role to total tokens for that role.

    Examples:
        >>> from lexilux import ChatHistory, Tokenizer
        >>> tokenizer = Tokenizer("Qwen/Qwen2.5-7B-Instruct")
        >>> history = ChatHistory.from_messages("Hello")
        >>> analysis = history.analyze_tokens(tokenizer)
        >>> print(f"Total tokens: {analysis.total_tokens}")
        >>> print(f"User tokens: {analysis.user_tokens}")
        >>> print(f"Assistant tokens: {analysis.assistant_tokens}")
    """

    total_tokens: int
    system_tokens: int
    user_tokens: int
    assistant_tokens: int
    total_messages: int
    system_messages: int
    user_messages: int
    assistant_messages: int
    per_message: list[tuple[str, str, int]]  # (role, content_preview, tokens)
    per_round: list[tuple[int, int, int, int]]  # (round_index, total, user, assistant)
    average_tokens_per_message: float
    average_tokens_per_round: float
    max_message_tokens: int
    min_message_tokens: int
    token_distribution: dict[str, int]

    def __repr__(self) -> str:
        """Return a compact summary: totals, user/assistant split, round count."""
        round_count = len(self.per_round)
        return (
            f"TokenAnalysis(total={self.total_tokens}, "
            f"user={self.user_tokens}, assistant={self.assistant_tokens}, "
            f"rounds={round_count})"
        )

    def to_dict(self) -> dict[str, Any]:
        """
        Convert the analysis to a plain dictionary.

        The per-message and per-round tuples are expanded into dicts with
        named keys. ``token_distribution`` is passed through as the same dict
        object held by this instance (it is not copied).

        Returns:
            Dictionary with one key per attribute.
        """
        message_rows: list[dict[str, Any]] = []
        for role, preview, tokens in self.per_message:
            message_rows.append(
                {"role": role, "content_preview": preview, "tokens": tokens}
            )

        round_rows: list[dict[str, Any]] = []
        for idx, total, user, assistant in self.per_round:
            round_rows.append(
                {
                    "round_index": idx,
                    "total_tokens": total,
                    "user_tokens": user,
                    "assistant_tokens": assistant,
                }
            )

        payload: dict[str, Any] = {
            "total_tokens": self.total_tokens,
            "system_tokens": self.system_tokens,
            "user_tokens": self.user_tokens,
            "assistant_tokens": self.assistant_tokens,
            "total_messages": self.total_messages,
            "system_messages": self.system_messages,
            "user_messages": self.user_messages,
            "assistant_messages": self.assistant_messages,
        }
        payload["per_message"] = message_rows
        payload["per_round"] = round_rows
        payload["average_tokens_per_message"] = self.average_tokens_per_message
        payload["average_tokens_per_round"] = self.average_tokens_per_round
        payload["max_message_tokens"] = self.max_message_tokens
        payload["min_message_tokens"] = self.min_message_tokens
        payload["token_distribution"] = self.token_distribution
        return payload
class ChatHistory(MutableSequence):
    """
    Conversation history manager.

    Implements the MutableSequence protocol over ``self.messages`` (the system
    message is stored separately in ``self.system``), allowing array-like
    operations:

    - Index access: history[0]
    - Slicing: history[1:5] (returns new ChatHistory)
    - Iteration: for msg in history
    - Length: len(history)
    - Membership: msg in history

    ChatHistory can be automatically built from messages or Chat results,
    eliminating the need for manual history maintenance.

    Examples:
        # Auto-extract from Chat call
        >>> result = chat("Hello")
        >>> history = ChatHistory.from_chat_result("Hello", result)

        # Auto-extract from messages
        >>> messages = [{"role": "user", "content": "Hello"}]
        >>> history = ChatHistory.from_messages(messages)

        # Manual construction (optional)
        >>> history = ChatHistory(system="You are helpful")
        >>> history.add_user("What is Python?")
        >>> result = chat(history.get_messages())
        >>> history.append_result(result)

        # Array-like operations
        >>> msg = history[0]  # Get first message
        >>> first_3 = history[:3]  # Get first 3 messages (new ChatHistory)
        >>> for msg in history:  # Iterate
        ...     print(msg)
        >>> len(history)  # Get length
        >>> msg in history  # Check membership
    """

    __slots__ = ("system", "messages", "metadata")

    def __init__(
        self,
        messages: list[dict[str, str]] | None = None,
        system: str | None = None,
    ):
        """
        Initialize conversation history.

        Args:
            messages: Message list (optional, can be extracted from anywhere).
            system: System message (optional).

        Note:
            The messages list is deep copied to prevent external modifications.
        """
        self.system = system
        # Deep copy to prevent external modifications to nested dicts
        self.messages: list[dict[str, str]] = deepcopy(messages or [])
        self.metadata: dict[str, Any] = {}  # Metadata (timestamps, model, etc.)

    @classmethod
    def _from_trusted(
        cls,
        messages: list[dict[str, str]],
        system: str | None = None,
    ) -> ChatHistory:
        """
        Internal constructor for trusted data (skips deepcopy).

        Use only when messages are already safe (e.g., from clone()).
        This is an internal method - external callers should use __init__.
        """
        instance = object.__new__(cls)
        instance.system = system
        instance.messages = messages  # No copy - caller guarantees safety
        instance.metadata = {}
        return instance

    @staticmethod
    def _split_system(
        normalized: list[dict[str, str]],
    ) -> tuple[str | None, list[dict[str, str]]]:
        """Split a leading system message off a normalized message list.

        Only the first message is inspected; any later system messages stay
        in the returned remainder.
        """
        if normalized and normalized[0].get("role") == "system":
            return normalized[0]["content"], normalized[1:]
        return None, normalized

    @staticmethod
    def _count_text_tokens(tokenizer: Tokenizer, text: str) -> int:
        """Tokenize *text* and return ``usage.total_tokens`` (0 if falsy)."""
        return tokenizer(text).usage.total_tokens or 0

    @classmethod
    def from_messages(
        cls, messages: MessagesLike, system: str | None = None
    ) -> ChatHistory:
        """
        Automatically build from message list (supports all Chat-supported formats).

        Args:
            messages: Messages in various formats (str, list of str, list of dict).
            system: Optional system message.

        Returns:
            ChatHistory instance.

        Examples:
            >>> history = ChatHistory.from_messages("Hello")
            >>> history = ChatHistory.from_messages([{"role": "user", "content": "Hello"}])
        """
        normalized = normalize_messages(messages, system=system)
        sys_msg, rest = cls._split_system(normalized)
        # NOTE(review): skips the defensive deepcopy on the assumption that
        # normalize_messages returns a fresh list - confirm against its source.
        return cls._from_trusted(messages=rest, system=sys_msg)

    @classmethod
    def from_chat_result(
        cls, messages: MessagesLike, result: ChatResult
    ) -> ChatHistory:
        """
        Automatically build complete history from Chat call and result.

        Args:
            messages: Messages sent to Chat (supports all formats).
            result: ChatResult from the API call.

        Returns:
            ChatHistory instance with complete conversation.

        Examples:
            >>> result = chat("Hello")
            >>> history = ChatHistory.from_chat_result("Hello", result)
        """
        sys_msg, rest = cls._split_system(normalize_messages(messages))
        # Build a new list so the assistant reply is never appended to the
        # list object returned by normalize_messages.
        history_messages = [*rest, {"role": "assistant", "content": result.text}]
        return cls._from_trusted(messages=history_messages, system=sys_msg)

    @classmethod
    def from_dict(cls, data: dict) -> ChatHistory:
        """
        Deserialize from dictionary.

        Restores ``messages``, ``system`` and ``metadata`` - the same keys that
        :meth:`to_dict` emits - so a to_dict/from_dict round trip is lossless.

        Args:
            data: Dictionary containing history data.

        Returns:
            ChatHistory instance.
        """
        instance = cls(
            messages=data.get("messages", []),
            system=data.get("system"),
        )
        # Deep copy for the same reason __init__ copies messages: the caller's
        # dict must not alias our internal state.
        instance.metadata = deepcopy(data.get("metadata") or {})
        return instance

    @classmethod
    def from_json(cls, json_str: str) -> ChatHistory:
        """
        Deserialize from JSON string.

        Args:
            json_str: JSON string containing history data.

        Returns:
            ChatHistory instance.
        """
        return cls.from_dict(json.loads(json_str))

    def add_user(self, content: str) -> None:
        """Add user message."""
        self.messages.append({"role": "user", "content": content})

    def add_assistant(self, content: str) -> None:
        """Add assistant message."""
        self.messages.append({"role": "assistant", "content": content})

    def add_message(self, role: str, content: str) -> None:
        """Add message with specified role."""
        self.messages.append({"role": role, "content": content})

    def add_system(self, content: str) -> None:
        """Add system message (updates system attribute)."""
        self.system = content

    def remove_last(self) -> dict[str, str] | None:
        """
        Remove and return the last message.

        Returns:
            The removed message dict, or None if history is empty.
        """
        return self.messages.pop() if self.messages else None

    def remove_at(self, index: int) -> dict[str, str] | None:
        """
        Remove and return message at specified index.

        Args:
            index: Index of message to remove (non-negative).

        Returns:
            The removed message dict, or None if index is out of range
            (negative indices are treated as out of range).
        """
        if 0 <= index < len(self.messages):
            return self.messages.pop(index)
        return None

    def replace_at(self, index: int, role: str, content: str) -> None:
        """
        Replace message at specified index.

        Args:
            index: Index of message to replace (non-negative).
            role: New role.
            content: New content.

        Raises:
            IndexError: If index is out of range.
        """
        if not (0 <= index < len(self.messages)):
            raise IndexError(
                f"Index {index} out of range for history with {len(self.messages)} messages"
            )
        self.messages[index] = {"role": role, "content": content}

    def get_user_messages(self) -> list[str]:
        """
        Get all user messages.

        Returns:
            List of user message contents ("" for a message without content).
        """
        return [
            msg.get("content", "")
            for msg in self.messages
            if msg.get("role") == "user"
        ]

    def get_assistant_messages(self) -> list[str]:
        """
        Get all assistant messages.

        Returns:
            List of assistant message contents ("" for a message without content).
        """
        return [
            msg.get("content", "")
            for msg in self.messages
            if msg.get("role") == "assistant"
        ]

    def get_last_message(self) -> dict[str, str] | None:
        """
        Get the last message.

        Returns:
            Last message dict, or None if history is empty.
        """
        return self.messages[-1] if self.messages else None

    def get_last_user_message(self) -> str | None:
        """
        Get the last user message content.

        Returns:
            Last user message content, or None if no user messages exist.
        """
        for msg in reversed(self.messages):
            if msg.get("role") == "user":
                return msg.get("content", "")
        return None

    def clone(self) -> ChatHistory:
        """
        Create an independent copy of this history.

        Each message dict is copied and the metadata is deep copied, so
        mutating the clone never affects the original.

        Returns:
            New ChatHistory instance with copied messages and metadata.
        """
        # _from_trusted avoids a second (redundant) deepcopy of the messages.
        duplicate = ChatHistory._from_trusted(
            messages=[msg.copy() for msg in self.messages],
            system=self.system,
        )
        duplicate.metadata = deepcopy(self.metadata)
        return duplicate

    def clear(self) -> None:
        """Clear all messages (keep system message)."""
        self.messages = []

    def get_messages(self, include_system: bool = True) -> list[dict[str, str]]:
        """
        Get messages list.

        Args:
            include_system: Whether to include system message.

        Returns:
            New list of message dictionaries. The system message (when
            included and non-empty) is a freshly built dict placed first; the
            remaining entries are the same dict objects held by this history.
        """
        result: list[dict[str, str]] = []
        if include_system and self.system:
            result.append({"role": "system", "content": self.system})
        result.extend(self.messages)
        return result

    def to_dict(self) -> dict[str, Any]:
        """
        Serialize to dictionary.

        Returns:
            Dictionary with "system", "messages" and "metadata". The
            messages list and metadata dict are the live internal objects,
            not copies.
        """
        return {
            "system": self.system,
            "messages": self.messages,
            "metadata": self.metadata,
        }

    def to_json(self, **kwargs) -> str:
        """
        Serialize to JSON string.

        Args:
            **kwargs: Additional arguments for json.dumps.

        Returns:
            JSON string.
        """
        return json.dumps(self.to_dict(), **kwargs)

    def count_tokens(self, tokenizer: Tokenizer) -> int:
        """
        Count total tokens in history.

        This is a convenience method that returns only the total token count.
        For detailed analysis, use :meth:`analyze_tokens` instead.

        Args:
            tokenizer: Tokenizer instance.

        Returns:
            Total token count across all messages (including system message).

        Examples:
            >>> from lexilux import ChatHistory, Tokenizer
            >>> tokenizer = Tokenizer("Qwen/Qwen2.5-7B-Instruct")
            >>> history = ChatHistory.from_messages("Hello")
            >>> total = history.count_tokens(tokenizer)
            >>> print(f"Total tokens: {total}")

        See Also:
            :meth:`analyze_tokens` - For detailed token analysis
        """
        return sum(
            self._count_text_tokens(tokenizer, msg.get("content", ""))
            for msg in self.get_messages(include_system=True)
        )

    def count_tokens_per_round(self, tokenizer: Tokenizer) -> list[tuple[int, int]]:
        """
        Count tokens per round.

        This method returns a simple list of (round_index, total_tokens) tuples.
        For more detailed per-round analysis (including user/assistant breakdown),
        use :meth:`analyze_tokens` instead.

        Args:
            tokenizer: Tokenizer instance.

        Returns:
            List of (round_index, total_tokens) tuples, where round_index is 0-based.

        Examples:
            >>> from lexilux import ChatHistory, Tokenizer
            >>> tokenizer = Tokenizer("Qwen/Qwen2.5-7B-Instruct")
            >>> history = ChatHistory.from_messages("Hello")
            >>> history.add_assistant("Hi!")
            >>> round_tokens = history.count_tokens_per_round(tokenizer)
            >>> for idx, tokens in round_tokens:
            ...     print(f"Round {idx}: {tokens} tokens")

        See Also:
            :meth:`analyze_tokens` - For detailed per-round analysis with role breakdown
        """
        return self._count_tokens_per_round_with_rounds(tokenizer, self._get_rounds())

    def _count_tokens_per_round_with_rounds(
        self,
        tokenizer: Tokenizer,
        rounds: list[list[dict[str, str]]],
    ) -> list[tuple[int, int]]:
        """Internal helper that accepts pre-computed rounds to avoid recomputation."""
        return [
            (
                idx,
                sum(
                    self._count_text_tokens(tokenizer, msg.get("content", ""))
                    for msg in round_messages
                ),
            )
            for idx, round_messages in enumerate(rounds)
        ]

    def count_tokens_by_role(self, tokenizer: Tokenizer) -> dict[str, int]:
        """
        Count tokens grouped by role (system, user, assistant).

        Messages with any other role are tokenized but not counted.

        Args:
            tokenizer: Tokenizer instance.

        Returns:
            Dictionary mapping role to total token count for that role.
            Keys: "system", "user", "assistant"

        Examples:
            >>> from lexilux import ChatHistory, Tokenizer
            >>> tokenizer = Tokenizer("Qwen/Qwen2.5-7B-Instruct")
            >>> history = ChatHistory(system="You are helpful")
            >>> history.add_user("Hello")
            >>> history.add_assistant("Hi!")
            >>> role_tokens = history.count_tokens_by_role(tokenizer)
            >>> print(f"User tokens: {role_tokens['user']}")
            >>> print(f"Assistant tokens: {role_tokens['assistant']}")
        """
        role_tokens: dict[str, int] = {"system": 0, "user": 0, "assistant": 0}
        for msg in self.get_messages(include_system=True):
            role = msg.get("role", "")
            tokens = self._count_text_tokens(tokenizer, msg.get("content", ""))
            if role in role_tokens:
                role_tokens[role] += tokens
        return role_tokens

    def analyze_tokens(self, tokenizer: Tokenizer) -> TokenAnalysis:
        """
        Perform comprehensive token analysis on conversation history.

        This method provides detailed token statistics including:
        - Total tokens and breakdown by role
        - Per-message token counts with content previews
        - Per-round analysis with user/assistant breakdown
        - Statistical metrics (averages, min, max)
        - Token distribution by role

        Every message is tokenized exactly once; the per-round figures reuse
        the per-message counts.

        Args:
            tokenizer: Tokenizer instance.

        Returns:
            TokenAnalysis object containing comprehensive token statistics.

        Examples:
            Basic usage:

            >>> from lexilux import ChatHistory, Tokenizer
            >>> tokenizer = Tokenizer("Qwen/Qwen2.5-7B-Instruct")
            >>> history = ChatHistory(system="You are helpful")
            >>> history.add_user("What is Python?")
            >>> history.add_assistant("Python is a programming language.")
            >>> analysis = history.analyze_tokens(tokenizer)
            >>> print(f"Total: {analysis.total_tokens}")
            >>> print(f"User: {analysis.user_tokens}, Assistant: {analysis.assistant_tokens}")

            Detailed analysis:

            >>> analysis = history.analyze_tokens(tokenizer)
            >>> # Per-message breakdown
            >>> for role, preview, tokens in analysis.per_message:
            ...     print(f"{role}: {preview[:30]}... ({tokens} tokens)")
            >>> # Per-round breakdown
            >>> for idx, total, user, assistant in analysis.per_round:
            ...     print(f"Round {idx}: total={total}, user={user}, assistant={assistant}")
            >>> # Distribution
            >>> print(f"Distribution: {analysis.token_distribution}")

            Export analysis:

            >>> analysis_dict = analysis.to_dict()
            >>> import json
            >>> print(json.dumps(analysis_dict, indent=2))
        """
        messages = self.get_messages(include_system=True)
        rounds = self._get_rounds()

        role_tokens: dict[str, int] = {"system": 0, "user": 0, "assistant": 0}
        role_counts: dict[str, int] = {"system": 0, "user": 0, "assistant": 0}
        per_message: list[tuple[str, str, int]] = []
        # id(msg) -> tokens. get_messages() and _get_rounds() hand out the same
        # dict objects from self.messages, so identity is a valid cache key
        # while `messages` keeps them alive.
        msg_token_cache: dict[int, int] = {}

        for msg in messages:
            role = msg.get("role", "")
            content = msg.get("content", "")
            tokens = self._count_text_tokens(tokenizer, content)
            msg_token_cache[id(msg)] = tokens

            # Content preview (first 50 chars)
            preview = content[:50] + "..." if len(content) > 50 else content
            per_message.append((role, preview, tokens))

            if role in role_tokens:
                role_tokens[role] += tokens
                role_counts[role] += 1

        message_tokens_list = [tokens for _, _, tokens in per_message]
        total_tokens = sum(message_tokens_list)

        # Per-round analysis - use cached token counts instead of re-tokenizing
        per_round: list[tuple[int, int, int, int]] = []
        for idx, round_messages in enumerate(rounds):
            round_total = round_user = round_assistant = 0
            for msg in round_messages:
                tokens = msg_token_cache.get(id(msg), 0)
                round_total += tokens
                role = msg.get("role", "")
                if role == "user":
                    round_user += tokens
                elif role == "assistant":
                    round_assistant += tokens
            per_round.append((idx, round_total, round_user, round_assistant))

        avg_per_message = total_tokens / len(messages) if messages else 0.0
        avg_per_round = (
            sum(entry[1] for entry in per_round) / len(per_round) if per_round else 0.0
        )

        return TokenAnalysis(
            total_tokens=total_tokens,
            system_tokens=role_tokens["system"],
            user_tokens=role_tokens["user"],
            assistant_tokens=role_tokens["assistant"],
            total_messages=len(messages),
            system_messages=role_counts["system"],
            user_messages=role_counts["user"],
            assistant_messages=role_counts["assistant"],
            per_message=per_message,
            per_round=per_round,
            average_tokens_per_message=round(avg_per_message, 2),
            average_tokens_per_round=round(avg_per_round, 2),
            max_message_tokens=max(message_tokens_list) if message_tokens_list else 0,
            min_message_tokens=min(message_tokens_list) if message_tokens_list else 0,
            token_distribution=dict(role_tokens),
        )

    def truncate_by_rounds(
        self,
        tokenizer: Tokenizer,
        max_tokens: int,
        keep_system: bool = True,
    ) -> ChatHistory:
        """
        Truncate by rounds, keeping the most recent rounds within max_tokens limit.

        Rounds are considered newest-first; the walk stops at the first round
        that would push the running total (system tokens included when
        ``keep_system`` is set) above ``max_tokens``.

        Args:
            tokenizer: Tokenizer instance.
            max_tokens: Maximum token count.
            keep_system: Whether to keep system message.

        Returns:
            New ChatHistory instance (does not modify original).
        """
        kept_system = self.system if keep_system else None
        rounds = self._get_rounds()
        if not rounds:
            return ChatHistory._from_trusted(messages=[], system=kept_system)

        # Count tokens per round - pass rounds to avoid recomputation
        round_tokens = self._count_tokens_per_round_with_rounds(tokenizer, rounds)

        current_tokens = 0
        if keep_system and self.system:
            current_tokens = self._count_text_tokens(tokenizer, self.system)

        # Find the oldest round that still fits, then slice once instead of
        # repeatedly inserting at the front of a list.
        first_kept = len(rounds)
        for idx in range(len(rounds) - 1, -1, -1):
            round_token_count = round_tokens[idx][1]
            if current_tokens + round_token_count > max_tokens:
                break
            current_tokens += round_token_count
            first_kept = idx

        # Rebuild messages - copy dicts so the result is independent
        new_messages = [
            msg.copy() for round_msgs in rounds[first_kept:] for msg in round_msgs
        ]
        return ChatHistory._from_trusted(messages=new_messages, system=kept_system)

    def get_last_n_rounds(self, n: int) -> ChatHistory:
        """
        Get last N rounds.

        Args:
            n: Number of rounds to get (n <= 0 yields no rounds).

        Returns:
            New ChatHistory instance with last N rounds and the same system
            message.
        """
        last_rounds = self._get_rounds()[-n:] if n > 0 else []
        # Copy dicts so the result is independent of this history
        new_messages = [msg.copy() for round_msgs in last_rounds for msg in round_msgs]
        return ChatHistory._from_trusted(messages=new_messages, system=self.system)

    def remove_last_round(self) -> None:
        """
        Remove the last round (user + assistant pair).

        Messages are matched by identity and searched from the end of the
        history, so an earlier message with identical role/content is never
        removed by mistake.
        """
        rounds = self._get_rounds()
        if not rounds:
            return
        for msg in reversed(rounds[-1]):
            for idx in range(len(self.messages) - 1, -1, -1):
                if self.messages[idx] is msg:
                    del self.messages[idx]
                    break

    def append_result(self, result: ChatResult) -> None:
        """Append ChatResult as assistant message."""
        self.add_assistant(result.text)

    def update_last_assistant(self, content: str) -> None:
        """Update the last assistant message content (useful for continue scenarios).

        If the history holds no assistant message, one is appended instead.
        """
        for msg in reversed(self.messages):
            if msg.get("role") == "assistant":
                msg["content"] = content
                return
        self.add_assistant(content)

    # MutableSequence protocol implementation

    def __len__(self) -> int:
        """Return the number of messages."""
        return len(self.messages)

    def __getitem__(self, key: int | slice) -> dict[str, str] | ChatHistory:
        """
        Get message(s) by index or slice.

        Args:
            key: Index (int) or slice.

        Returns:
            Single message dict (index) or new ChatHistory instance (slice).
            A slice result holds deep copies of the selected messages.

        Raises:
            TypeError: If key is neither an int nor a slice.

        Examples:
            >>> history[0]  # Get first message
            >>> history[1:3]  # Get messages at index 1-2, returns new ChatHistory
            >>> history[:5]  # Get first 5 messages
            >>> history[-3:]  # Get last 3 messages
        """
        if isinstance(key, int):
            return self.messages[key]
        if isinstance(key, slice):
            # The constructor deep-copies, so no extra list copy is needed.
            return ChatHistory(messages=self.messages[key], system=self.system)
        raise TypeError(f"Invalid key type: {type(key)}")

    def __setitem__(
        self, key: int | slice, value: dict[str, str] | Sequence[dict[str, str]]
    ) -> None:
        """
        Set message(s) by index or slice.

        Args:
            key: Index (int) or slice.
            value: Single message dict (index) or sequence of message dicts (slice).

        Raises:
            TypeError: If key or value type is invalid.
        """
        if isinstance(key, int):
            if not isinstance(value, dict):
                raise TypeError("Value must be a dict")
            self.messages[key] = value
        elif isinstance(key, slice):
            if not isinstance(value, (list, tuple)):
                raise TypeError("Value must be a list or tuple of dicts")
            self.messages[key] = list(value)
        else:
            raise TypeError(f"Invalid key type: {type(key)}")

    def __delitem__(self, key: int | slice) -> None:
        """
        Delete message(s) by index or slice.

        Args:
            key: Index (int) or slice.
        """
        del self.messages[key]

    def insert(self, index: int, value: dict[str, str]) -> None:
        """
        Insert message at specified index.

        Args:
            index: Index to insert at.
            value: Message dict to insert.

        Raises:
            TypeError: If value is not a dict.
        """
        if not isinstance(value, dict):
            raise TypeError("Value must be a dict")
        self.messages.insert(index, value)

    def __iter__(self):
        """Iterate over messages."""
        return iter(self.messages)

    def __contains__(self, item) -> bool:
        """Check if message is in history."""
        return item in self.messages

    def __add__(self, other: ChatHistory) -> ChatHistory:
        """
        Merge two histories (concatenate messages).

        Args:
            other: Another ChatHistory instance.

        Returns:
            New ChatHistory instance with merged messages.
            System message from self is used. ``NotImplemented`` is returned
            for operands that are not ChatHistory instances.

        Examples:
            >>> history1 = ChatHistory.from_messages("Hello")
            >>> history2 = ChatHistory.from_messages("How are you?")
            >>> combined = history1 + history2
        """
        if not isinstance(other, ChatHistory):
            return NotImplemented
        return ChatHistory(
            messages=self.messages + other.messages,
            system=self.system,  # Use self's system message
        )

    def __repr__(self) -> str:
        """Return string representation."""
        return f"ChatHistory(messages={len(self.messages)}, system={self.system is not None})"

    def _get_rounds(self) -> list[list[dict[str, str]]]:
        """
        Get conversation rounds (user + assistant pairs).

        A user message opens a round (flushing any unfinished one); an
        assistant message is added to the current round and closes it.
        Messages with any other role are not part of any round.

        Returns:
            List of rounds, each round is a list of the message dicts held in
            ``self.messages`` (not copies).
        """
        rounds: list[list[dict[str, str]]] = []
        current_round: list[dict[str, str]] = []

        for msg in self.messages:
            role = msg.get("role")
            if role == "user":
                # Start new round
                if current_round:
                    rounds.append(current_round)
                current_round = [msg]
            elif role == "assistant":
                # Add to current round; round complete
                current_round.append(msg)
                rounds.append(current_round)
                current_round = []

        # Add incomplete round if exists
        if current_round:
            rounds.append(current_round)

        return rounds
# Utility functions for ChatHistory operations


def merge_histories(*histories: ChatHistory) -> ChatHistory:
    """
    Concatenate several conversation histories into a new one.

    The system message of the first history (which may be None) is used for
    the result; messages are appended in argument order.

    Args:
        *histories: Multiple ChatHistory instances to merge.

    Returns:
        New ChatHistory instance with merged messages. An empty ChatHistory
        when called without arguments.

    Examples:
        >>> history1 = ChatHistory.from_messages("Hello")
        >>> history2 = ChatHistory.from_messages("How are you?")
        >>> merged = merge_histories(history1, history2)
    """
    if not histories:
        return ChatHistory()

    combined = [msg for item in histories for msg in item.messages]
    return ChatHistory(messages=combined, system=histories[0].system)


def filter_by_role(history: ChatHistory, role: str) -> ChatHistory:
    """
    Build a new history containing only the messages with the given role.

    Args:
        history: ChatHistory instance to filter.
        role: Role to filter by ("user", "assistant", "system", "tool").

    Returns:
        New ChatHistory instance with filtered messages. The system message
        is carried over only when ``role`` is "system".

    Examples:
        >>> history = ChatHistory.from_messages(["Hello", "Hi there"])
        >>> user_only = filter_by_role(history, "user")
    """
    kept_system = history.system if role == "system" else None
    matching = [entry for entry in history.messages if entry.get("role") == role]
    return ChatHistory(messages=matching, system=kept_system)


def search_content(history: ChatHistory, pattern: str) -> list[dict[str, str]]:
    """
    Find the messages whose content contains ``pattern``.

    Args:
        history: ChatHistory instance to search.
        pattern: Search pattern (case-sensitive substring match).

    Returns:
        List of message dictionaries that contain the pattern, in history
        order (the system message is searched too).

    Examples:
        >>> history = ChatHistory.from_messages(["Hello world", "Hi there"])
        >>> results = search_content(history, "world")
        >>> # Returns messages containing "world"
    """
    return [
        entry
        for entry in history.get_messages(include_system=True)
        if pattern in entry.get("content", "")
    ]


def get_statistics(
    history: ChatHistory, tokenizer: Tokenizer | None = None
) -> dict[str, Any]:
    """
    Collect summary statistics about a conversation history.

    Character-based figures are always computed; token-based figures are
    added when a tokenizer is supplied.

    Args:
        history: ChatHistory instance to analyze.
        tokenizer: Optional Tokenizer instance for token statistics.
            If provided, includes detailed token analysis.

    Returns:
        Dictionary with statistics:
        - total_rounds: Number of conversation rounds
        - total_messages: Total number of messages
        - user_messages: Number of user messages
        - assistant_messages: Number of assistant messages
        - system_messages: Number of system messages (0 or 1)
        - has_system: Whether system message exists
        - total_characters: Total characters across all messages
        - average_message_length: Average length of messages (characters)
        - token_analysis: TokenAnalysis object (if tokenizer provided)
        - total_tokens: Total tokens (if tokenizer provided)
        - tokens_by_role: Dict of tokens by role (if tokenizer provided)
        - average_tokens_per_message, average_tokens_per_round,
          max_message_tokens, min_message_tokens (if tokenizer provided)

    Examples:
        Basic statistics (character-based):

        >>> history = ChatHistory.from_messages(["Hello", "Hi"])
        >>> stats = get_statistics(history)
        >>> print(stats["total_rounds"])

        With token analysis:

        >>> from lexilux import Tokenizer
        >>> tokenizer = Tokenizer("Qwen/Qwen2.5-7B-Instruct")
        >>> stats = get_statistics(history, tokenizer=tokenizer)
        >>> print(f"Total tokens: {stats['total_tokens']}")
        >>> print(f"User tokens: {stats['tokens_by_role']['user']}")
    """
    all_messages = history.get_messages(include_system=True)
    round_list = history._get_rounds()

    # Single pass: tally roles and characters together.
    n_user = 0
    n_assistant = 0
    n_chars = 0
    for entry in all_messages:
        entry_role = entry.get("role")
        if entry_role == "user":
            n_user += 1
        if entry_role == "assistant":
            n_assistant += 1
        n_chars += len(entry.get("content", ""))

    if all_messages:
        mean_length = n_chars / len(all_messages)
    else:
        mean_length = 0

    stats: dict[str, Any] = {
        "total_rounds": len(round_list),
        "total_messages": len(all_messages),
        "user_messages": n_user,
        "assistant_messages": n_assistant,
        "system_messages": 1 if history.system else 0,
        "has_system": history.system is not None,
        "total_characters": n_chars,
        "average_message_length": round(mean_length, 2),
    }

    if tokenizer is None:
        return stats

    # Token statistics are layered on top of the character-based ones.
    report = history.analyze_tokens(tokenizer)
    stats.update(
        {
            "token_analysis": report,
            "total_tokens": report.total_tokens,
            "tokens_by_role": report.token_distribution,
            "average_tokens_per_message": report.average_tokens_per_message,
            "average_tokens_per_round": report.average_tokens_per_round,
            "max_message_tokens": report.max_message_tokens,
            "min_message_tokens": report.min_message_tokens,
        }
    )
    return stats