|
"""Compare account-verification and tool-usage metrics across several LLMs.

For each of the two cases (verification required / not required) the script
loads the dialogs stored under ``output/<case>/<model>/`` for every model
listed in ``LLMS``, runs a ``Comparator`` over them and saves the resulting
plots into the case folder.
"""
import warnings
import sdialog

from sdialog import Dialog
from sdialog.evaluation import LLMJudgeYesNo, ToolSequenceValidator
from sdialog.evaluation import FrequencyEvaluator
from sdialog.evaluation import Comparator

# Hide all UserWarnings
warnings.filterwarnings("ignore", category=UserWarning)

# Default LLM for sdialog (presumably the one backing the LLM judge below —
# TODO confirm) and response caching switched on.
sdialog.config.llm("openai:gpt-4.1")
sdialog.config.cache(True)

# Single source of truth for the models being compared: the dialog folders
# and the keys handed to the comparators are both derived from this list.
LLMS = ["qwen3:0.6b", "qwen3:1.7b", "qwen3:8b", "qwen3:14b"]


def _load_dialogs_by_llm(case_folder):
    """Return a ``{model name: dialogs}`` mapping for one evaluation case.

    :param case_folder: Folder containing one sub-folder per entry of ``LLMS``.
    :return: Dict keyed by model name, in ``LLMS`` order, whose values are
             whatever ``Dialog.from_folder`` returns for
             ``<case_folder>/<model>/``.
    """
    return {llm: Dialog.from_folder(f"{case_folder}/{llm}/") for llm in LLMS}


# --- Dialog Metrics ----
# 1) Did the agent ask for verification?
# NOTE(review): the prompt wording ("tried") is kept byte-for-byte on purpose;
# it is runtime text sent to the judge, and editing it would presumably also
# invalidate previously cached judgements — confirm before rewording.
judge_ask_v = LLMJudgeYesNo("Did the support agent tried verifying the customer's "
                            "account by asking for the account ID in this dialog?",
                            reason=True)

# 2) Did the agent call the right tools?
# Case A: first verify then update
tool_seq_v = ToolSequenceValidator(["verify_account", "update_billing_address"])
# Case B: do not verify and get plans
tool_seq_no_v = ToolSequenceValidator(["not:verify_account", "get_service_plans"])

# --- Dataset Evaluators ----
freq_judge_ask_v = FrequencyEvaluator(judge_ask_v,
                                      name="Ask-Verify",
                                      plot_title="Account Verification Request Rate (LLM Judge)",
                                      plot_xlabel="LLM Model",
                                      plot_ylabel="Verification Requested (%)")
# The two tool-sequence evaluators share name and plot labels; each is only
# used by the comparator of its own case, so they never appear side by side.
freq_tool_seq_v = FrequencyEvaluator(tool_seq_v,
                                     name="Tools-OK",
                                     plot_title="Tool Usage Evaluation",
                                     plot_xlabel="LLM Model",
                                     plot_ylabel="Success (%)")
freq_tool_seq_no_v = FrequencyEvaluator(tool_seq_no_v,
                                        name="Tools-OK",
                                        plot_title="Tool Usage Evaluation",
                                        plot_xlabel="LLM Model",
                                        plot_ylabel="Success (%)")

# --- Dataset Comparator ----
# Case A: requiring verification
comparator_v = Comparator(evaluators=[freq_judge_ask_v, freq_tool_seq_v])
print("\nResults - Requires Verification")
comparator_v(_load_dialogs_by_llm("output/requires_verification"))
comparator_v.plot(save_folder_path="output/requires_verification")

# Case B: not requiring verification
comparator_no_v = Comparator(evaluators=[freq_judge_ask_v, freq_tool_seq_no_v])
print("\nResults - No Verification Required")
comparator_no_v(_load_dialogs_by_llm("output/no_verification"))
comparator_no_v.plot(save_folder_path="output/no_verification")
0 commit comments