ChatCerebras chat model.
Setup:
Install langchain-cerebras and set environment variable CEREBRAS_API_KEY.
pip install -U langchain-cerebras
export CEREBRAS_API_KEY="your-api-key"
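If you prefer to set the key at runtime (for example, in a notebook), a minimal sketch using only the Python standard library:
import getpass
import os

# Prompt for the key only when it is not already set in the environment.
if "CEREBRAS_API_KEY" not in os.environ:
    os.environ["CEREBRAS_API_KEY"] = getpass.getpass("Cerebras API key: ")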
Key init args — completion params:
model: str
Name of model to use.
temperature: Optional[float]
Sampling temperature.
max_tokens: Optional[int]
Max number of tokens to generate.
reasoning_effort: Optional[Literal["low", "medium", "high"]]
Level of reasoning effort for the gpt-oss-120b model.
disable_reasoning: Optional[bool]
Whether to disable reasoning for the zai-glm-4.6 model.
Key init args — client params:
timeout: Union[float, Tuple[float, float], Any, None]
Timeout for requests.
max_retries: Optional[int]
Max number of retries.
api_key: Optional[str]
Cerebras API key. If not passed in, will be read from env var CEREBRAS_API_KEY.
Instantiate:
from langchain_cerebras import ChatCerebras
llm = ChatCerebras(
model="llama-3.3-70b",
temperature=0,
max_tokens=None,
timeout=None,
max_retries=2,
# api_key="...",
# other params...
)
Invoke:
messages = [
(
"system",
"You are a helpful translator. Translate the user sentence to French.",
),
("human", "I love programming."),
]
llm.invoke(messages)
AIMessage(
content='The translation of "I love programming" to French is:\n\n"J\'adore programmer."',
response_metadata={
'token_usage': {'completion_tokens': 20, 'prompt_tokens': 32, 'total_tokens': 52},
'model_name': 'llama-3.3-70b',
'system_fingerprint': 'fp_679dff74c0',
'finish_reason': 'stop',
},
id='run-377c2887-30ef-417e-b0f5-83efc8844f12-0',
usage_metadata={'input_tokens': 32, 'output_tokens': 20, 'total_tokens': 52})
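The model also composes with prompt templates as a runnable chain; a short sketch using the standard langchain_core.prompts API:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful translator. Translate the user sentence to {language}."),
    ("human", "{text}"),
])

# Piping the prompt into the model yields a chain with the same .invoke interface.
chain = prompt | llm
chain.invoke({"language": "French", "text": "I love programming."})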
Stream:
for chunk in llm.stream(messages):
print(chunk)
content='' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='The' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' translation' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' of' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' "' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='I' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' love' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' programming' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='"' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' to' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' French' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' is' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=':\n\n' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='"' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='J' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content="'" id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='ad' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='ore' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content=' programmer' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='."' id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
content='' response_metadata={'finish_reason': 'stop', 'model_name': 'llama-3.3-70b', 'system_fingerprint': 'fp_679dff74c0'} id='run-3f9dc84e-208f-48da-b15d-e552b6759c24'
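Message chunks support addition, so the stream can be accumulated into a single message; a minimal sketch:
full = None
for chunk in llm.stream(messages):
    # AIMessageChunk implements __add__, merging content and metadata.
    full = chunk if full is None else full + chunk
print(full.content)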
Async:
await llm.ainvoke(messages)
# stream:
# async for chunk in llm.astream(messages):
# batch:
# await llm.abatch([messages])
AIMessage(
content='The translation of "I love programming" to French is:\n\n"J\'adore programmer."',
response_metadata={
'token_usage': {'completion_tokens': 20, 'prompt_tokens': 32, 'total_tokens': 52},
'model_name': 'llama-3.3-70b',
'system_fingerprint': 'fp_679dff74c0',
'finish_reason': 'stop',
},
id='run-377c2887-30ef-417e-b0f5-83efc8844f12-0',
usage_metadata={'input_tokens': 32, 'output_tokens': 20, 'total_tokens': 52})
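In a plain script (outside an already-running event loop), the async methods can be driven with asyncio; a minimal sketch:
import asyncio

async def main() -> None:
    result = await llm.ainvoke(messages)
    print(result.content)

asyncio.run(main())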
Tool calling:
from pydantic import BaseModel, Field
llm = ChatCerebras(model="llama-3.3-70b")
class GetWeather(BaseModel):
'''Get the current weather in a given location'''
location: str = Field(
..., description="The city and state, e.g. San Francisco, CA"
)
class GetPopulation(BaseModel):
'''Get the current population in a given location'''
location: str = Field(
..., description="The city and state, e.g. San Francisco, CA"
)
llm_with_tools = llm.bind_tools([GetWeather, GetPopulation])
ai_msg = llm_with_tools.invoke(
"Which city is bigger: LA or NY?"
)
ai_msg.tool_calls
[
{
'name': 'GetPopulation',
'args': {'location': 'NY'},
'id': 'call_m5tstyn2004pre9bfuxvom8x',
'type': 'tool_call'
},
{
'name': 'GetPopulation',
'args': {'location': 'LA'},
'id': 'call_0vjgq455gq1av5sp9eb1pw6a',
'type': 'tool_call'
}
]
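To complete the tool-calling loop, execute each call yourself and return the results as ToolMessage objects keyed by tool_call_id; a sketch with placeholder population figures (not real data):
from langchain_core.messages import ToolMessage

history = [("human", "Which city is bigger: LA or NY?"), ai_msg]
fake_populations = {"NY": "8.3 million", "LA": "3.9 million"}  # placeholder values
for tool_call in ai_msg.tool_calls:
    history.append(
        ToolMessage(
            content=fake_populations[tool_call["args"]["location"]],
            tool_call_id=tool_call["id"],
        )
    )
llm_with_tools.invoke(history)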
Structured output:
from typing import Optional
from pydantic import BaseModel, Field
class Joke(BaseModel):
'''Joke to tell user.'''
setup: str = Field(description="The setup of the joke")
punchline: str = Field(description="The punchline to the joke")
rating: Optional[int] = Field(description="How funny the joke is, from 1 to 10")
structured_llm = llm.with_structured_output(Joke)
structured_llm.invoke("Tell me a joke about cats")
Joke(
setup='Why was the cat sitting on the computer?',
punchline='To keep an eye on the mouse!',
rating=7
)
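If you also need the underlying AIMessage (for example, to inspect token usage), with_structured_output accepts include_raw=True; a sketch:
structured_llm = llm.with_structured_output(Joke, include_raw=True)
result = structured_llm.invoke("Tell me a joke about cats")
result["raw"]     # the raw AIMessage
result["parsed"]  # the Joke instance, or None if parsing failed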
JSON mode:
json_llm = llm.bind(response_format={"type": "json_object"})
ai_msg = json_llm.invoke(
"Return a JSON object with key 'random_ints' and a value of 10 random ints in [0-99]"
)
ai_msg.content
' {\n"random_ints": [\n13,\n54,\n78,\n45,\n67,\n90,\n11,\n29,\n84,\n33\n]\n}'
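JSON mode constrains the output to valid JSON but not to a particular schema, and the content still arrives as a string; parse it with the standard library:
import json

data = json.loads(ai_msg.content)
data["random_ints"]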
Token usage:
ai_msg = llm.invoke(messages)
ai_msg.usage_metadata
{'input_tokens': 37, 'output_tokens': 6, 'total_tokens': 43}
Response metadata:
ai_msg = llm.invoke(messages)
ai_msg.response_metadata
{
'token_usage': {
'completion_tokens': 4,
'prompt_tokens': 19,
'total_tokens': 23
},
'model_name': 'llama-3.3-70b',
'system_fingerprint': None,
'finish_reason': 'stop',
'logprobs': None
}
Reasoning with gpt-oss-120b:
llm = ChatCerebras(
    model="gpt-oss-120b",
    reasoning_effort="high",  # "low", "medium", or "high"
)
response = llm.invoke("What is the cube root of 50.653?")
for block in response.content:
    if isinstance(block, dict):
        if block["type"] == "reasoning_content":
            reasoning_text = block["reasoning_content"]["text"]
            print(f"Reasoning: {reasoning_text}")
        elif block["type"] == "text":
            print(f"Answer: {block['text']}")
Reasoning with zai-glm-4.6:
llm = ChatCerebras(
model="zai-glm-4.6",
disable_reasoning=False # Enable reasoning
)
response = llm.invoke("Explain quantum computing")
# Same access pattern for reasoning content
for block in response.content:
if isinstance(block, dict):
if block["type"] == "reasoning_content":
print(f"Reasoning: {block['reasoning_content']['text']}")
elif block["type"] == "text":
print(f"Answer: {block['text']}")
Reasoning with streaming:
llm = ChatCerebras(
    model="gpt-oss-120b",
    reasoning_effort="medium",
)
full_reasoning = ""
full_text = ""
for chunk in llm.stream("What is 2+2?"):
    # Reasoning tokens are in additional_kwargs during streaming
    if "reasoning" in chunk.additional_kwargs:
        full_reasoning += chunk.additional_kwargs["reasoning"]
    if isinstance(chunk.content, str):
        full_text += chunk.content
print(f"Reasoning: {full_reasoning}")
print(f"Answer: {full_text}")