ChatCerebras chat model.
Setup:
Install langchain-cerebras and set environment variable CEREBRAS_API_KEY.
pip install -U langchain-cerebras
export CEREBRAS_API_KEY="your-api-key"
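If you prefer to set the key at runtime (for example, in a notebook), a minimal sketch using only the Python standard library:
import getpass
import os

# Prompt for the key only when it is not already set in the environment.
if "CEREBRAS_API_KEY" not in os.environ:
    os.environ["CEREBRAS_API_KEY"] = getpass.getpass("Cerebras API key: ")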
Key init args — completion params:
model: str
Name of model to use.
temperature: Optional[float]
Sampling temperature.
max_tokens: Optional[int]
Max number of tokens to generate.
reasoning_effort: Optional[Literal["low", "medium", "high"]]
Level of reasoning effort for the gpt-oss-120b model.
disable_reasoning: Optional[bool]
Whether to disable reasoning for the zai-glm-4.6 model.
Key init args — client params:
timeout: Union[float, Tuple[float, float], Any, None]
Timeout for requests.
max_retries: Optional[int]
Max number of retries.
api_key: Optional[str]
Cerebras API key. If not passed in, will be read from env var CEREBRAS_API_KEY.
Instantiate:
from langchain_cerebras import ChatCerebras
llm = ChatCerebras(
model="llama-3.3-70b",
temperature=0,
max_tokens=None,
timeout=None,
max_retries=2,
# api_key="...",
# other params...
)
Invoke:
messages = [
(
"system",
"You are a helpful translator. Translate the user sentence to French.",
),
("human", "I love programming."),
]
llm.invoke(messages)
AIMessage(
content='The translation of "I love programming" to French is:\n\n"J\'adore programmer."',
response_metadata={
'token_usage': {'completion_tokens': 20, 'prompt_tokens': 32, 'total_tokens': 52},
'model_name': 'llama-3.3-70b',
'system_fingerprint': 'fp_679dff74c0',
'finish_reason': 'stop',
},
id='run-377c2887-30ef-417e-b0f5-83efc8844f12-0',
usage_metadata={'input_tokens': 32, 'output_tokens': 20, 'total_tokens': 52})
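The model also composes with prompt templates as a runnable chain; a short sketch using the standard langchain_core.prompts API:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful translator. Translate the user sentence to {language}."),
    ("human", "{text}"),
])

# Piping the prompt into the model yields a chain with the same .invoke interface.
chain = prompt | llm
chain.invoke({"language": "French", "text": "I love programming."})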
Stream:
for chunk in llm.stream(messages):
print(chunk)
content='' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='The' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' translation' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' of' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' "' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='I' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' love' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' programming' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='"' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' to' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' French' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' is' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=':\n\n' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='"' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='J' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content="'" id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='ad' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='ore' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' programmer' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='."' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='' response_metadata={'finish_reason': 'stop', 'model_name': 'llama-3.3-70b', 'system_fingerprint': 'fp_679dff74c0'} id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
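Message chunks support addition, so the stream can be accumulated into a single message; a minimal sketch:
full = None
for chunk in llm.stream(messages):
    # AIMessageChunk implements __add__, merging content and metadata.
    full = chunk if full is None else full + chunk
print(full.content)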
Async:
await llm.ainvoke(messages)
# stream:
# async for chunk in llm.astream(messages):
# batch:
# await llm.abatch([messages])
AIMessage(
content='The translation of "I love programming" to French is:\n\n"J\'adore programmer."',
response_metadata={
'token_usage': {'completion_tokens': 20, 'prompt_tokens': 32, 'total_tokens': 52},
'model_name': 'llama-3.3-70b',
'system_fingerprint': 'fp_679dff74c0',
'finish_reason': 'stop',
},
id='run-377c2887-30ef-417e-b0f5-83efc8844f12-0',
usage_metadata={'input_tokens': 32, 'output_tokens': 20, 'total_tokens': 52})
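In a plain script (outside an already-running event loop), the async methods can be driven with asyncio; a minimal sketch:
import asyncio

async def main() -> None:
    result = await llm.ainvoke(messages)
    print(result.content)

asyncio.run(main())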
Tool calling:
from pydantic import BaseModel, Field
llm = ChatCerebras(model="llama-3.3-70b")
class GetWeather(BaseModel):
'''Get the current weather in a given location'''
location: str = Field(
..., description="The city and state, e.g. San Francisco, CA"
)
class GetPopulation(BaseModel):
'''Get the current population in a given location'''
location: str = Field(
..., description="The city and state, e.g. San Francisco, CA"
)
llm_with_tools = llm.bind_tools([GetWeather, GetPopulation])
ai_msg = llm_with_tools.invoke(
"Which city is bigger: LA or NY?"
)
ai_msg.tool_calls
[
{
'name': 'GetPopulation',
'args': {'location': 'NY'},
'id': 'call_m5tstyn2004pre9bfuxvom8x',
'type': 'tool_call'
},
{
'name': 'GetPopulation',
'args': {'location': 'LA'},
'id': 'call_0vjgq455gq1av5sp9eb1pw6a',
'type': 'tool_call'
}
]
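To complete the tool-calling loop, execute each call yourself and return the results as ToolMessage objects keyed by tool_call_id; a sketch with placeholder population figures (not real data):
from langchain_core.messages import ToolMessage

history = [("human", "Which city is bigger: LA or NY?"), ai_msg]
fake_populations = {"NY": "8.3 million", "LA": "3.9 million"}  # placeholder values
for tool_call in ai_msg.tool_calls:
    history.append(
        ToolMessage(
            content=fake_populations[tool_call["args"]["location"]],
            tool_call_id=tool_call["id"],
        )
    )
llm_with_tools.invoke(history)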
Structured output:
from typing import Optional
from pydantic import BaseModel, Field
class Joke(BaseModel):
'''Joke to tell user.'''
setup: str = Field(description="The setup of the joke")
punchline: str = Field(description="The punchline to the joke")
rating: Optional[int] = Field(description="How funny the joke is, from 1 to 10")
structured_llm = llm.with_structured_output(Joke)
structured_llm.invoke("Tell me a joke about cats")
Joke(
setup='Why was the cat sitting on the computer?',
punchline='To keep an eye on the mouse!',
rating=7
)
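If you also need the underlying AIMessage (for example, to inspect token usage), with_structured_output accepts include_raw=True; a sketch:
structured_llm = llm.with_structured_output(Joke, include_raw=True)
result = structured_llm.invoke("Tell me a joke about cats")
result["raw"]     # the raw AIMessage
result["parsed"]  # the Joke instance, or None if parsing failed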
JSON mode:
json_llm = llm.bind(response_format={"type": "json_object"})
ai_msg = json_llm.invoke(
"Return a JSON object with key 'random_ints' and a value of 10 random ints in [0-99]"
)
ai_msg.content
' {\n"random_ints": [\n13,\n54,\n78,\n45,\n67,\n90,\n11,\n29,\n84,\n33\n]\n}'
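JSON mode constrains the output to valid JSON but not to a particular schema, and the content still arrives as a string; parse it with the standard library:
import json

data = json.loads(ai_msg.content)
data["random_ints"]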
Token usage:
ai_msg = llm.invoke(messages)
ai_msg.usage_metadata
{'input_tokens': 37, 'output_tokens': 6, 'total_tokens': 43}
Response metadata:
ai_msg = llm.invoke(messages)
ai_msg.response_metadata
{
'token_usage': {
'completion_tokens': 4,
'prompt_tokens': 19,
'total_tokens': 23
},
'model_name': 'llama-3.3-70b',
'system_fingerprint': None,
'finish_reason': 'stop',
'logprobs': None
}
Reasoning with gpt-oss-120b:
llm = ChatCerebras(
    model="gpt-oss-120b",
    reasoning_effort="high",  # "low", "medium", or "high"
)
response = llm.invoke("What is the cube root of 50.653?")
for block in response.content:
    if isinstance(block, dict):
        if block["type"] == "reasoning_content":
            reasoning_text = block["reasoning_content"]["text"]
            print(f"Reasoning: {reasoning_text}")
        elif block["type"] == "text":
            print(f"Answer: {block['text']}")
Reasoning with zai-glm-4.6:
llm = ChatCerebras(
model="zai-glm-4.6",
disable_reasoning=False # Enable reasoning
)
response = llm.invoke("Explain quantum computing")
# Same access pattern for reasoning content
for block in response.content:
if isinstance(block, dict):
if block["type"] == "reasoning_content":
print(f"Reasoning: {block['reasoning_content']['text']}")
elif block["type"] == "text":
print(f"Answer: {block['text']}")
Reasoning with streaming:
llm = ChatCerebras(
    model="gpt-oss-120b",
    reasoning_effort="medium",
)
full_reasoning = ""
full_text = ""
for chunk in llm.stream("What is 2+2?"):
    # Reasoning tokens are in additional_kwargs during streaming
    if "reasoning" in chunk.additional_kwargs:
        full_reasoning += chunk.additional_kwargs["reasoning"]
    if isinstance(chunk.content, str):
        full_text += chunk.content
print(f"Reasoning: {full_reasoning}")
print(f"Answer: {full_text}")