Tidy pipelines and structured output

# Chunk defaults for this vignette. Chunks are evaluated only when the
# environment variable LLMR_RUN_VIGNETTES is "true" (compared
# case-insensitively); when it is unset it is treated as "false".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment  = "#>",
  eval     = identical(
    tolower(Sys.getenv("LLMR_RUN_VIGNETTES", unset = "false")),
    "true"
  )
)

We’ll show both unstructured and structured pipelines, using four model names:

- gpt-5-nano (OpenAI)
- claude-sonnet-4-20250514 (Anthropic)
- gemini-2.5-flash (Gemini)
- openai/gpt-oss-20b (Groq)

You will need to set the environment variables OPENAI_API_KEY, ANTHROPIC_API_KEY, GEMINI_API_KEY, and GROQ_API_KEY. The code chunks are evaluated only when LLMR_RUN_VIGNETTES is set to "true".

library(LLMR)
library(dplyr)

# One configuration object per provider: provider name first, model name second.
cfg_openai <- llm_config("openai", "gpt-5-nano")

# Anthropic requires max_tokens; passing it explicitly avoids warnings.
cfg_cld <- llm_config(
  "anthropic",
  "claude-sonnet-4-20250514",
  max_tokens = 512
)

cfg_gemini <- llm_config("gemini", "gemini-2.5-flash")
cfg_groq <- llm_config("groq", "openai/gpt-oss-20b")

llm_fn: unstructured (OpenAI)

# Send one classification prompt per word through the OpenAI config and
# collect the replies, together with their diagnostics, as columns.
words <- c("excellent", "awful", "fine")

out <- words |>
  llm_fn(
    prompt = "Classify '{x}' as Positive, Negative, or Neutral.",
    .config = cfg_openai,
    .return = "columns"
  )

out
#> # A tibble: 3 × 14
#>   response_text finish_reason sent_tokens rec_tokens total_tokens
#>   <chr>         <chr>               <int>      <int>        <int>
#> 1 Positive      stop                   19        138          157
#> 2 Negative.     stop                   20        204          224
#> 3 Neutral       stop                   19        266          285
#> # ℹ 9 more variables: reasoning_tokens <int>, success <lgl>,
#> #   error_message <chr>, status_code <int>, error_code <chr>, bad_param <chr>,
#> #   response_id <chr>, duration <dbl>, raw_response_json <chr>

llm_fn: unstructured (Groq)

# Same classification prompt over `words`, this time using the Groq config.
out_groq <- words |>
  llm_fn(
    prompt = "Classify '{x}' as Positive, Negative, or Neutral.",
    .config = cfg_groq,
    .return = "columns"
  )

out_groq
#> # A tibble: 3 × 14
#>   response_text finish_reason sent_tokens rec_tokens total_tokens
#>   <chr>         <chr>               <int>      <int>        <int>
#> 1 Positive      stop                   84         50          134
#> 2 Negative      stop                   85         89          174
#> 3 Positive      stop                   84        103          187
#> # ℹ 9 more variables: reasoning_tokens <int>, success <lgl>,
#> #   error_message <chr>, status_code <int>, error_code <chr>, bad_param <chr>,
#> #   response_id <chr>, duration <dbl>, raw_response_json <chr>

llm_fn_structured: schema-first (OpenAI)

# JSON-Schema-style description of the structured reply: an object with a
# string `label` and a numeric `score`, both required, and no other properties.
schema <- list(
  type = "object",
  properties = list(
    label = list(
      type = "string",
      description = "Sentiment label"
    ),
    score = list(
      type = "number",
      description = "Confidence 0..1"
    )
  ),
  # Both properties are mandatory.
  required = list("label", "score"),
  additionalProperties = FALSE
)

# Structured variant of the classification: the reply is requested against
# `schema`, and the `label` and `score` fields are pulled out as columns.
out_s <- llm_fn_structured(
  x = words,
  .schema = schema,
  .fields = c("label", "score"),
  prompt = "Classify '{x}' as Positive, Negative, or Neutral with confidence.",
  .config = cfg_openai
)

out_s
#> # A tibble: 3 × 20
#>   response_text                finish_reason sent_tokens rec_tokens total_tokens
#>   <chr>                        <chr>               <int>      <int>        <int>
#> 1 "{\"label\":\"Positive\",\"… stop                   69        409          478
#> 2 "{\"label\":\"Negative\",\"… stop                   70        410          480
#> 3 "{\"label\":\"Neutral\",\"s… stop                   69        473          542
#> # ℹ 15 more variables: reasoning_tokens <int>, success <lgl>,
#> #   error_message <chr>, status_code <int>, error_code <chr>, bad_param <chr>,
#> #   response_id <chr>, duration <dbl>, raw_response_json <chr>,
#> #   structured_ok <lgl>, structured_data <list>, label <chr>, score <dbl>,
#> #   structured_valid <lgl>, structured_error <chr>

llm_mutate: unstructured (Anthropic)

# Small example table: an integer id and one short sentence per row.
df <- tibble::tibble(
  id = seq_len(3),
  text = c(
    "Cats are great pets",
    "The weather is bad",
    "I like tea"
  )
)

# Add an `answer` column (plus `answer_*` diagnostic columns) produced by the
# Anthropic config; "{text}" in the prompt refers to the `text` column.
df_u <- llm_mutate(
  df,
  answer,
  prompt = "Give a short category for: {text}",
  .config = cfg_cld,
  .return = "columns"
)

df_u
#> # A tibble: 3 × 15
#>   answer answer_finish answer_sent answer_rec answer_tot answer_reason answer_ok
#>   <chr>  <chr>               <int>      <int>      <int>         <int> <lgl>    
#> 1 **Pet… stop                   18         14         32            NA TRUE     
#> 2 **Wea… stop                   17         14         31            NA TRUE     
#> 3 **Bev… stop                   16         10         26            NA TRUE     
#> # ℹ 8 more variables: answer_err <chr>, answer_id <chr>, answer_status <int>,
#> #   answer_ecode <chr>, answer_param <chr>, answer_t <dbl>, id <int>,
#> #   text <chr>

llm_mutate_structured: structured (Gemini)

# Schema for the annotation reply: an object holding two required string
# fields, `category` and `rationale`, with no other properties allowed.
schema2 <- list(
  type = "object",
  properties = list(
    category = list(
      type = "string"
    ),
    rationale = list(
      type = "string"
    )
  ),
  # Both properties are mandatory.
  required = list("category", "rationale"),
  additionalProperties = FALSE
)

# Structured annotation of each row with the Gemini config; the raw reply is
# stored in `annot` alongside `annot_*` diagnostic columns.
df_s <- llm_mutate_structured(
  df,
  annot,
  prompt = "Extract category and a one-sentence rationale for: {text}",
  .config = cfg_gemini,
  .schema = schema2
  # With a schema supplied, its fields are hoisted into columns automatically.
  # To name them explicitly instead, you can also pass:
  # .fields = c("category", "rationale")
)

df_s
#> # A tibble: 3 × 19
#>   annot        annot_finish annot_sent annot_rec annot_tot annot_reason annot_ok
#>   <chr>        <chr>             <int>     <int>     <int>        <int> <lgl>   
#> 1 "{\n  \"cat… stop                 15        31        46          923 TRUE    
#> 2 "{\n  \"cat… stop                 15        30        45          129 TRUE    
#> 3 "{\n  \"cat… stop                 14        36        50          129 TRUE    
#> # ℹ 12 more variables: annot_err <chr>, annot_id <chr>, annot_status <int>,
#> #   annot_ecode <chr>, annot_param <chr>, annot_t <dbl>, id <int>, text <chr>,
#> #   structured_ok <lgl>, structured_data <list>, category <chr>,
#> #   rationale <chr>