sherpa_ai.models package#

Submodules#

sherpa_ai.models.chat_model_with_logging module#

class sherpa_ai.models.chat_model_with_logging.ChatModelWithLogging(*args: ~typing.Any, name: str | None = None, cache: ~langchain_core.caches.BaseCache | bool | None = None, verbose: bool = <factory>, callbacks: list[~langchain_core.callbacks.base.BaseCallbackHandler] | ~langchain_core.callbacks.base.BaseCallbackManager | None = None, tags: list[str] | None = None, metadata: dict[str, ~typing.Any] | None = None, custom_get_token_ids: ~typing.Callable[[str], list[int]] | None = None, callback_manager: ~langchain_core.callbacks.base.BaseCallbackManager | None = None, rate_limiter: ~langchain_core.rate_limiters.BaseRateLimiter | None = None, disable_streaming: bool | ~typing.Literal['tool_calling'] = False, llm: ~langchain_core.language_models.chat_models.BaseChatModel, logger: ~loguru._logger.Logger)[source]#

Bases: BaseChatModel

llm: BaseChatModel#
logger: Logger#
model_config: ClassVar[ConfigDict] = {'arbitrary_types_allowed': True, 'extra': 'ignore', 'protected_namespaces': ()}#

Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict.
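
A minimal usage sketch (not part of the generated reference; it assumes an OpenAI API key is configured in the environment, and the log-file name is illustrative):

from langchain_openai import ChatOpenAI
from loguru import logger

from sherpa_ai.models.chat_model_with_logging import ChatModelWithLogging

# Write structured records of each LLM call to a file (name is illustrative).
logger.add("llm_calls.jsonl", serialize=True)

# Wrap any LangChain chat model; the wrapper exposes the usual BaseChatModel API.
logged_llm = ChatModelWithLogging(
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    logger=logger,
)

print(logged_llm.invoke("What is the capital of France?").content)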

sherpa_ai.models.sherpa_base_chat_model module#

class sherpa_ai.models.sherpa_base_chat_model.SherpaBaseChatModel(*args: ~typing.Any, name: str | None = None, cache: ~langchain_core.caches.BaseCache | bool | None = None, verbose: bool = <factory>, callbacks: list[~langchain_core.callbacks.base.BaseCallbackHandler] | ~langchain_core.callbacks.base.BaseCallbackManager | None = None, tags: list[str] | None = None, metadata: dict[str, ~typing.Any] | None = None, custom_get_token_ids: ~typing.Callable[[str], list[int]] | None = None, callback_manager: ~langchain_core.callbacks.base.BaseCallbackManager | None = None, rate_limiter: ~langchain_core.rate_limiters.BaseRateLimiter | None = None, disable_streaming: bool | ~typing.Literal['tool_calling'] = False, user_id: str | None = None, verbose_logger: ~sherpa_ai.verbose_loggers.base.BaseVerboseLogger = None)[source]#

Bases: BaseChatModel

model_config: ClassVar[ConfigDict] = {'arbitrary_types_allowed': True, 'extra': 'ignore', 'protected_namespaces': ()}#

Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict.

user_id: str | None#
verbose_logger: BaseVerboseLogger#
class sherpa_ai.models.sherpa_base_chat_model.SherpaChatOpenAI(*args: ~typing.Any, name: str | None = None, cache: ~langchain_core.caches.BaseCache | bool | None = None, verbose: bool = <factory>, callbacks: list[~langchain_core.callbacks.base.BaseCallbackHandler] | ~langchain_core.callbacks.base.BaseCallbackManager | None = None, tags: list[str] | None = None, metadata: dict[str, ~typing.Any] | None = None, custom_get_token_ids: ~typing.Callable[[str], list[int]] | None = None, callback_manager: ~langchain_core.callbacks.base.BaseCallbackManager | None = None, rate_limiter: ~langchain_core.rate_limiters.BaseRateLimiter | None = None, disable_streaming: bool | ~typing.Literal['tool_calling'] = False, client: ~typing.Any = None, async_client: ~typing.Any = None, root_client: ~typing.Any = None, root_async_client: ~typing.Any = None, model: str = 'gpt-3.5-turbo', temperature: float = 0.7, model_kwargs: ~typing.Dict[str, ~typing.Any] = <factory>, api_key: ~pydantic.types.SecretStr | None = <factory>, base_url: str | None = None, organization: str | None = None, openai_proxy: str | None = <factory>, timeout: float | ~typing.Tuple[float, float] | ~typing.Any | None = None, max_retries: int = 2, presence_penalty: float | None = None, frequency_penalty: float | None = None, seed: int | None = None, logprobs: bool | None = None, top_logprobs: int | None = None, logit_bias: ~typing.Dict[int, int] | None = None, streaming: bool = False, n: int = 1, top_p: float | None = None, max_completion_tokens: int | None = None, reasoning_effort: str | None = None, tiktoken_model_name: str | None = None, default_headers: ~typing.Mapping[str, str] | None = None, default_query: ~typing.Mapping[str, object] | None = None, http_client: ~typing.Any | None = None, http_async_client: ~typing.Any | None = None, stop_sequences: ~typing.List[str] | str | None = None, extra_body: ~typing.Mapping[str, ~typing.Any] | None = None, include_response_headers: bool = False, disabled_params: ~typing.Dict[str, ~typing.Any] | None = None, stream_usage: bool = False, user_id: str | None = None, verbose_logger: ~sherpa_ai.verbose_loggers.base.BaseVerboseLogger = None)[source]#

Bases: ChatOpenAI

model_config: ClassVar[ConfigDict] = {'arbitrary_types_allowed': True, 'extra': 'ignore', 'populate_by_name': True, 'protected_namespaces': ()}#

Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict.

user_id: str | None#
verbose_logger: BaseVerboseLogger#
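
A minimal usage sketch (assumes an OpenAI API key is configured; the user_id value is illustrative and verbose_logger may be omitted):

from sherpa_ai.models.sherpa_base_chat_model import SherpaChatOpenAI

# Accepts the usual ChatOpenAI parameters plus the Sherpa-specific user_id
# and verbose_logger fields shown in the signature above.
llm = SherpaChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0.7,
    user_id="user-123",  # illustrative identifier
)

print(llm.invoke("Say hello in one word.").content)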

sherpa_ai.models.sherpa_base_model module#

class sherpa_ai.models.sherpa_base_model.SherpaOpenAI(*args: ~typing.Any, name: str | None = None, cache: ~langchain_core.caches.BaseCache | bool | None = None, verbose: bool = <factory>, callbacks: list[~langchain_core.callbacks.base.BaseCallbackHandler] | ~langchain_core.callbacks.base.BaseCallbackManager | None = None, tags: list[str] | None = None, metadata: dict[str, ~typing.Any] | None = None, custom_get_token_ids: ~typing.Callable[[str], list[int]] | None = None, callback_manager: ~langchain_core.callbacks.base.BaseCallbackManager | None = None, rate_limiter: ~langchain_core.rate_limiters.BaseRateLimiter | None = None, disable_streaming: bool | ~typing.Literal['tool_calling'] = False, client: ~typing.Any = None, async_client: ~typing.Any = None, root_client: ~typing.Any = None, root_async_client: ~typing.Any = None, model: str = 'gpt-3.5-turbo', temperature: float = 0.7, model_kwargs: ~typing.Dict[str, ~typing.Any] = <factory>, api_key: ~pydantic.types.SecretStr | None = <factory>, base_url: str | None = None, organization: str | None = None, openai_proxy: str | None = <factory>, timeout: float | ~typing.Tuple[float, float] | ~typing.Any | None = None, max_retries: int = 2, presence_penalty: float | None = None, frequency_penalty: float | None = None, seed: int | None = None, logprobs: bool | None = None, top_logprobs: int | None = None, logit_bias: ~typing.Dict[int, int] | None = None, streaming: bool = False, n: int = 1, top_p: float | None = None, max_completion_tokens: int | None = None, reasoning_effort: str | None = None, tiktoken_model_name: str | None = None, default_headers: ~typing.Mapping[str, str] | None = None, default_query: ~typing.Mapping[str, object] | None = None, http_client: ~typing.Any | None = None, http_async_client: ~typing.Any | None = None, stop_sequences: ~typing.List[str] | str | None = None, extra_body: ~typing.Mapping[str, ~typing.Any] | None = None, include_response_headers: bool = False, disabled_params: ~typing.Dict[str, ~typing.Any] | None = None, stream_usage: bool = False, user_id: str | None = None)[source]#

Bases: ChatOpenAI

model_config: ClassVar[ConfigDict] = {'arbitrary_types_allowed': True, 'extra': 'ignore', 'populate_by_name': True, 'protected_namespaces': ()}#

Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict.

user_id: str | None#

Module contents#

class sherpa_ai.models.SherpaChatOpenAI(*args: ~typing.Any, name: str | None = None, cache: ~langchain_core.caches.BaseCache | bool | None = None, verbose: bool = <factory>, callbacks: list[~langchain_core.callbacks.base.BaseCallbackHandler] | ~langchain_core.callbacks.base.BaseCallbackManager | None = None, tags: list[str] | None = None, metadata: dict[str, ~typing.Any] | None = None, custom_get_token_ids: ~typing.Callable[[str], list[int]] | None = None, callback_manager: ~langchain_core.callbacks.base.BaseCallbackManager | None = None, rate_limiter: ~langchain_core.rate_limiters.BaseRateLimiter | None = None, disable_streaming: bool | ~typing.Literal['tool_calling'] = False, client: ~typing.Any = None, async_client: ~typing.Any = None, root_client: ~typing.Any = None, root_async_client: ~typing.Any = None, model: str = 'gpt-3.5-turbo', temperature: float = 0.7, model_kwargs: ~typing.Dict[str, ~typing.Any] = <factory>, api_key: ~pydantic.types.SecretStr | None = <factory>, base_url: str | None = None, organization: str | None = None, openai_proxy: str | None = <factory>, timeout: float | ~typing.Tuple[float, float] | ~typing.Any | None = None, max_retries: int = 2, presence_penalty: float | None = None, frequency_penalty: float | None = None, seed: int | None = None, logprobs: bool | None = None, top_logprobs: int | None = None, logit_bias: ~typing.Dict[int, int] | None = None, streaming: bool = False, n: int = 1, top_p: float | None = None, max_completion_tokens: int | None = None, reasoning_effort: str | None = None, tiktoken_model_name: str | None = None, default_headers: ~typing.Mapping[str, str] | None = None, default_query: ~typing.Mapping[str, object] | None = None, http_client: ~typing.Any | None = None, http_async_client: ~typing.Any | None = None, stop_sequences: ~typing.List[str] | str | None = None, extra_body: ~typing.Mapping[str, ~typing.Any] | None = None, include_response_headers: bool = False, disabled_params: ~typing.Dict[str, ~typing.Any] | None = None, stream_usage: bool = False, user_id: str | None = None, verbose_logger: ~sherpa_ai.verbose_loggers.base.BaseVerboseLogger = None)[source]#

Bases: ChatOpenAI

cache: BaseCache | bool | None#

Whether to cache the response.

  • If true, will use the global cache.

  • If false, will not use a cache.

  • If None, will use the global cache if it’s set, otherwise no cache.

  • If instance of BaseCache, will use the provided cache.

Caching is not currently supported for streaming methods of models.
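
For example (an illustrative sketch; SQLiteCache comes from the optional langchain_community package):

from langchain_community.cache import SQLiteCache

from sherpa_ai.models import SherpaChatOpenAI

# Use an explicit per-instance cache backed by a local SQLite file...
cached_llm = SherpaChatOpenAI(
    model="gpt-3.5-turbo",
    cache=SQLiteCache(database_path=".llm_cache.db"),  # illustrative path
)

# ...or opt this instance out of caching entirely.
uncached_llm = SherpaChatOpenAI(model="gpt-3.5-turbo", cache=False)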

callback_manager: BaseCallbackManager | None#
callbacks: Callbacks#

Callbacks to add to the run trace.

custom_get_token_ids: Callable[[str], list[int]] | None#

Optional encoder to use for counting tokens.

default_headers: Mapping[str, str] | None#
default_query: Mapping[str, object] | None#
disable_streaming: bool | Literal['tool_calling']#

Whether to disable streaming for this model.

If streaming is bypassed, then stream()/astream()/astream_events() will defer to invoke()/ainvoke().

  • If True, will always bypass streaming case.

  • If “tool_calling”, will bypass streaming case only when the model is called with a tools keyword argument.

  • If False (default), will always use streaming case if available.
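
For instance (an illustrative sketch):

from sherpa_ai.models import SherpaChatOpenAI

# stream()/astream() fall back to invoke()/ainvoke() only when tools are
# passed to the call; plain calls still stream.
llm = SherpaChatOpenAI(model="gpt-3.5-turbo", disable_streaming="tool_calling")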

disabled_params: Dict[str, Any] | None#

Parameters of the OpenAI client or chat.completions endpoint that should be disabled for the given model.

Should be specified as {"param": None | ['val1', 'val2']} where the key is the parameter and the value is either None, meaning that parameter should never be used, or it’s a list of disabled values for the parameter.

For example, older models may not support the 'parallel_tool_calls' parameter at all, in which case disabled_params={"parallel_tool_calls": None} can be passed in.

If a parameter is disabled, it will not be used by default in any methods, e.g. in with_structured_output(). However, this does not prevent a user from directly passing in the parameter during invocation.
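
For example (illustrative values):

from sherpa_ai.models import SherpaChatOpenAI

llm = SherpaChatOpenAI(
    model="gpt-3.5-turbo",
    disabled_params={
        "parallel_tool_calls": None,  # never send this parameter
        "temperature": [2.0],         # disallow this particular value (illustrative)
    },
)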

extra_body: Mapping[str, Any] | None#

Optional additional JSON properties to include in the request parameters when making requests to OpenAI compatible APIs, such as vLLM.

frequency_penalty: float | None#

Penalizes repeated tokens according to frequency.

http_async_client: Any | None#

Optional httpx.AsyncClient. Only used for async invocations. Must specify http_client as well if you’d like a custom client for sync invocations.

http_client: Any | None#

Optional httpx.Client. Only used for sync invocations. Must specify http_async_client as well if you’d like a custom client for async invocations.

include_response_headers: bool#

Whether to include response headers in the output message response_metadata.

logit_bias: Dict[int, int] | None#

Modify the likelihood of specified tokens appearing in the completion.
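
For example (the token ID is illustrative; a bias of -100 effectively bans a token, 100 effectively forces it):

from sherpa_ai.models import SherpaChatOpenAI

llm = SherpaChatOpenAI(model="gpt-3.5-turbo", logit_bias={50256: -100})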

logprobs: bool | None#

Whether to return logprobs.

max_retries: int#

Maximum number of retries to make when generating.

max_tokens: int | None#

Maximum number of tokens to generate.

metadata: dict[str, Any] | None#

Metadata to add to the run trace.

model_config: ClassVar[ConfigDict] = {'arbitrary_types_allowed': True, 'extra': 'ignore', 'populate_by_name': True, 'protected_namespaces': ()}#

Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict.

model_kwargs: Dict[str, Any]#

Holds any model parameters valid for the create call that are not explicitly specified.
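
For example (an illustrative sketch, assuming the forwarded parameter has no dedicated attribute on this class):

from sherpa_ai.models import SherpaChatOpenAI

# Extra create-call parameters can be forwarded via model_kwargs.
llm = SherpaChatOpenAI(
    model="gpt-3.5-turbo",
    model_kwargs={"user": "end-user-123"},  # illustrative value
)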

model_name: str#

Model name to use.

n: int#

Number of chat completions to generate for each prompt.

name: str | None#

The name of the Runnable. Used for debugging and tracing.

openai_api_base: str | None#

Base URL path for API requests; leave blank if not using a proxy or service emulator.

openai_api_key: SecretStr | None#
openai_organization: str | None#

Automatically inferred from env var OPENAI_ORG_ID if not provided.

openai_proxy: str | None#
presence_penalty: float | None#

Penalizes repeated tokens.

rate_limiter: BaseRateLimiter | None#

An optional rate limiter to use for limiting the number of requests.

reasoning_effort: str | None#

Constrains effort on reasoning for reasoning models.

o1 models only.

Currently supported values are low, medium, and high. Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning in a response.

Added in version 0.2.14.
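
For example (illustrative; only meaningful for reasoning models):

from sherpa_ai.models import SherpaChatOpenAI

llm = SherpaChatOpenAI(model="o1", reasoning_effort="low")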

request_timeout: float | Tuple[float, float] | Any | None#

Timeout for requests to OpenAI completion API. Can be float, httpx.Timeout or None.

seed: int | None#

Seed for generation.

stop: List[str] | str | None#

Default stop sequences.

stream_usage: bool#

Whether to include usage metadata in streaming output. If True, additional message chunks will be generated during the stream including usage metadata.

streaming: bool#

Whether to stream the results or not.

tags: list[str] | None#

Tags to add to the run trace.

temperature: float#

What sampling temperature to use.

tiktoken_model_name: str | None#

The model name to pass to tiktoken when using this class. Tiktoken is used to count the number of tokens in documents to constrain them to be under a certain limit. By default, when set to None, this will be the same as the model name. However, there are some cases where you may want to use this class with a model name not supported by tiktoken, such as when using Azure OpenAI or one of the many providers that expose an OpenAI-like API but with different models. In those cases, to avoid errors when tiktoken is called, you can specify a model name to use here.
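
For example, when pointing at an OpenAI-compatible server that serves a model tiktoken does not recognize (the URL and model names are illustrative):

from sherpa_ai.models import SherpaChatOpenAI

llm = SherpaChatOpenAI(
    model="my-local-model",               # not known to tiktoken
    base_url="http://localhost:8000/v1",  # e.g. a vLLM server
    tiktoken_model_name="gpt-3.5-turbo",  # count tokens as if it were this model
)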

top_logprobs: int | None#

Number of most likely tokens to return at each token position, each with an associated log probability. logprobs must be set to true if this parameter is used.
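
For example (illustrative):

from sherpa_ai.models import SherpaChatOpenAI

# logprobs must be enabled for top_logprobs to take effect.
llm = SherpaChatOpenAI(model="gpt-3.5-turbo", logprobs=True, top_logprobs=3)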

top_p: float | None#

Total probability mass of tokens to consider at each step.

user_id: str | None#
verbose: bool#

Whether to print out response text.

verbose_logger: BaseVerboseLogger#
class sherpa_ai.models.SherpaOpenAI(*args: ~typing.Any, name: str | None = None, cache: ~langchain_core.caches.BaseCache | bool | None = None, verbose: bool = <factory>, callbacks: list[~langchain_core.callbacks.base.BaseCallbackHandler] | ~langchain_core.callbacks.base.BaseCallbackManager | None = None, tags: list[str] | None = None, metadata: dict[str, ~typing.Any] | None = None, custom_get_token_ids: ~typing.Callable[[str], list[int]] | None = None, callback_manager: ~langchain_core.callbacks.base.BaseCallbackManager | None = None, rate_limiter: ~langchain_core.rate_limiters.BaseRateLimiter | None = None, disable_streaming: bool | ~typing.Literal['tool_calling'] = False, client: ~typing.Any = None, async_client: ~typing.Any = None, root_client: ~typing.Any = None, root_async_client: ~typing.Any = None, model: str = 'gpt-3.5-turbo', temperature: float = 0.7, model_kwargs: ~typing.Dict[str, ~typing.Any] = <factory>, api_key: ~pydantic.types.SecretStr | None = <factory>, base_url: str | None = None, organization: str | None = None, openai_proxy: str | None = <factory>, timeout: float | ~typing.Tuple[float, float] | ~typing.Any | None = None, max_retries: int = 2, presence_penalty: float | None = None, frequency_penalty: float | None = None, seed: int | None = None, logprobs: bool | None = None, top_logprobs: int | None = None, logit_bias: ~typing.Dict[int, int] | None = None, streaming: bool = False, n: int = 1, top_p: float | None = None, max_completion_tokens: int | None = None, reasoning_effort: str | None = None, tiktoken_model_name: str | None = None, default_headers: ~typing.Mapping[str, str] | None = None, default_query: ~typing.Mapping[str, object] | None = None, http_client: ~typing.Any | None = None, http_async_client: ~typing.Any | None = None, stop_sequences: ~typing.List[str] | str | None = None, extra_body: ~typing.Mapping[str, ~typing.Any] | None = None, include_response_headers: bool = False, disabled_params: ~typing.Dict[str, ~typing.Any] | None = None, stream_usage: bool = False, user_id: str | None = None)[source]#

Bases: ChatOpenAI

cache: BaseCache | bool | None#

Whether to cache the response.

  • If true, will use the global cache.

  • If false, will not use a cache.

  • If None, will use the global cache if it’s set, otherwise no cache.

  • If instance of BaseCache, will use the provided cache.

Caching is not currently supported for streaming methods of models.

callback_manager: BaseCallbackManager | None#
callbacks: Callbacks#

Callbacks to add to the run trace.

custom_get_token_ids: Callable[[str], list[int]] | None#

Optional encoder to use for counting tokens.

default_headers: Mapping[str, str] | None#
default_query: Mapping[str, object] | None#
disable_streaming: bool | Literal['tool_calling']#

Whether to disable streaming for this model.

If streaming is bypassed, then stream()/astream()/astream_events() will defer to invoke()/ainvoke().

  • If True, will always bypass streaming case.

  • If “tool_calling”, will bypass streaming case only when the model is called with a tools keyword argument.

  • If False (default), will always use streaming case if available.

disabled_params: Dict[str, Any] | None#

Parameters of the OpenAI client or chat.completions endpoint that should be disabled for the given model.

Should be specified as {"param": None | ['val1', 'val2']} where the key is the parameter and the value is either None, meaning that parameter should never be used, or it’s a list of disabled values for the parameter.

For example, older models may not support the 'parallel_tool_calls' parameter at all, in which case disabled_params={"parallel_tool_calls": None} can be passed in.

If a parameter is disabled, it will not be used by default in any methods, e.g. in with_structured_output(). However, this does not prevent a user from directly passing in the parameter during invocation.

extra_body: Mapping[str, Any] | None#

Optional additional JSON properties to include in the request parameters when making requests to OpenAI compatible APIs, such as vLLM.

frequency_penalty: float | None#

Penalizes repeated tokens according to frequency.

http_async_client: Any | None#

Optional httpx.AsyncClient. Only used for async invocations. Must specify http_client as well if you’d like a custom client for sync invocations.

http_client: Any | None#

Optional httpx.Client. Only used for sync invocations. Must specify http_async_client as well if you’d like a custom client for async invocations.

include_response_headers: bool#

Whether to include response headers in the output message response_metadata.

logit_bias: Dict[int, int] | None#

Modify the likelihood of specified tokens appearing in the completion.

logprobs: bool | None#

Whether to return logprobs.

max_retries: int#

Maximum number of retries to make when generating.

max_tokens: int | None#

Maximum number of tokens to generate.

metadata: dict[str, Any] | None#

Metadata to add to the run trace.

model_config: ClassVar[ConfigDict] = {'arbitrary_types_allowed': True, 'extra': 'ignore', 'populate_by_name': True, 'protected_namespaces': ()}#

Configuration for the model; should be a dictionary conforming to pydantic's ConfigDict.

model_kwargs: Dict[str, Any]#

Holds any model parameters valid for the create call that are not explicitly specified.

model_name: str#

Model name to use.

n: int#

Number of chat completions to generate for each prompt.

name: str | None#

The name of the Runnable. Used for debugging and tracing.

openai_api_base: str | None#

Base URL path for API requests; leave blank if not using a proxy or service emulator.

openai_api_key: SecretStr | None#
openai_organization: str | None#

Automatically inferred from env var OPENAI_ORG_ID if not provided.

openai_proxy: str | None#
presence_penalty: float | None#

Penalizes repeated tokens.

rate_limiter: BaseRateLimiter | None#

An optional rate limiter to use for limiting the number of requests.

reasoning_effort: str | None#

Constrains effort on reasoning for reasoning models.

o1 models only.

Currently supported values are low, medium, and high. Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning in a response.

Added in version 0.2.14.

request_timeout: float | Tuple[float, float] | Any | None#

Timeout for requests to OpenAI completion API. Can be float, httpx.Timeout or None.

seed: int | None#

Seed for generation.

stop: List[str] | str | None#

Default stop sequences.

stream_usage: bool#

Whether to include usage metadata in streaming output. If True, additional message chunks will be generated during the stream including usage metadata.

streaming: bool#

Whether to stream the results or not.

tags: list[str] | None#

Tags to add to the run trace.

temperature: float#

What sampling temperature to use.

tiktoken_model_name: str | None#

The model name to pass to tiktoken when using this class. Tiktoken is used to count the number of tokens in documents to constrain them to be under a certain limit. By default, when set to None, this will be the same as the model name. However, there are some cases where you may want to use this class with a model name not supported by tiktoken, such as when using Azure OpenAI or one of the many providers that expose an OpenAI-like API but with different models. In those cases, to avoid errors when tiktoken is called, you can specify a model name to use here.

top_logprobs: int | None#

Number of most likely tokens to return at each token position, each with an associated log probability. logprobs must be set to true if this parameter is used.

top_p: float | None#

Total probability mass of tokens to consider at each step.

user_id: str | None#
verbose: bool#

Whether to print out response text.