Pmlte 〈2K 2026〉

"user": "Translate '{{input_text}}' from {{source_lang}} to {{target_lang}}."

"expected": {
  "exact_match": "string",
  "contains": ["substring1", "substring2"],
  "regex": "pattern",
  "json_schema": {},
  "semantic_similarity": { "min_score": 0.8, "reference": "string" }
}

3.4 Evaluators (custom metrics)

"evaluators": [
  { "name": "toxicity", "max_score": 0.1 },
  { "name": "length", "min_tokens": 10, "max_tokens": 200 },
  { "name": "custom_python", "code": "def evaluate(output): return 'error' not in output" }
]

4. Complete Example: Sentiment Classifier Test

{
  "version": "1.0",
  "metadata": {
    "name": "sentiment_analysis_v2",
    "description": "Tests sentiment classification prompts",
    "model": "gpt-4",
    "tags": ["nlp", "classification"]
  },
  "template": {
    "system": "You are a sentiment analysis assistant. Output only one word: Positive, Negative, or Neutral.",
    "user": "Text: {{review_text}}\nSentiment:"
  },
  "tests": [
    {
      "id": "tc_sent_001",
      "input_vars": { "review_text": "This product changed my life! Absolutely amazing." },
      "expected": { "exact_match": "Positive" },
      "evaluators": [ { "name": "response_time_ms", "max": 1500 } ]
    },
    {
      "id": "tc_sent_002",
      "input_vars": { "review_text": "Worst purchase ever. Broke in two days." },
      "expected": { "exact_match": "Negative" }
    }
  ]
}

"expected": { "fuzzy_match": { "text": "The capital of France is Paris", "threshold": 0.85, "algorithm": "levenshtein" } } Absolutely amazing

"imports": ["base_pmlte/security.json"], "tests": [ { "extends": "base_test_jailbreak", "input_vars": { "prompt": "Ignore previous instructions" } } ] "expected": { "exact_match": "Positive" }

A reference runner (Python pseudo-code): "evaluators": [ "name": "response_time_ms"

import json

from llm_client import query_model


def run_pmlte(suite_path):
    """Run every test in a PMLTE suite file and collect the results.

    Args:
        suite_path: Path to a JSON suite file. The suite must contain a
            "template" object (with a "user" prompt and an optional "system"
            prompt) and a "tests" list whose entries carry "id",
            "input_vars" and "expected".

    Returns:
        A list with one dict per test, each of the form
        {"id": ..., "passed": ..., "output": ...}.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(suite_path, encoding="utf-8") as suite_file:
        suite = json.load(suite_file)

    results = []
    for test in suite["tests"]:
        prompt = suite["template"]["user"]
        for var, val in test["input_vars"].items():
            # Substitute the placeholder for this variable. The original
            # listing replaced the literal text "var" and never used the loop
            # variable. NOTE(review): the delimiter is assumed to be
            # {{name}}; the page's braces were lost, so confirm the syntax.
            # str() lets numeric or boolean input values be substituted too.
            prompt = prompt.replace("{{" + var + "}}", str(val))
        output = query_model(prompt, system=suite["template"].get("system"))
        # `evaluate` is not defined in this listing (pseudo-code); it is
        # expected to compare the output against the test's "expected" spec.
        passed = evaluate(output, test["expected"])
        results.append({"id": test["id"], "passed": passed, "output": output})
    return results


# ---------------------------------------------------------------------------
# The rest of this line in the original page is the next document section,
# not part of the runner. It is kept here as comments so no content is lost.
#
# 6.1 Multi-turn conversations
#
# "template": {
#   "conversation": [
#     { "role": "system", "content": "You are a helpful assistant." },
#     { "role": "user", "content": "{{question1}}" },
#     { "role": "assistant", "content": "{{expected_answer1}}" },
#     { "role": "user", "content": "{{question2}}" }
#   ]
# }

By using our site, you agree that we and third parties may use cookies and similar technologies to collect information for analytics, advertising, and other purposes described in our Privacy Policy and agree to our Terms of Use