
Evaluating Agents

Learn how to evaluate AI agents comprehensively using Azure AI Evaluation, covering performance, behavior, and user experience, along with the evaluation methods and test scenarios that support them.

Evaluation Strategy

1. Performance Assessment

  • Response accuracy
  • Processing speed
  • Resource efficiency
  • Scalability metrics
  • Error rates
  • Recovery times

2. Behavioral Analysis

  • Decision quality
  • Adaptation capability
  • Learning patterns
  • Error handling
  • Context awareness
  • Task completion

3. User Experience

  • Interaction quality
  • Response relevance
  • User satisfaction
  • Task efficiency
  • Error recovery
  • Overall usability
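
The three dimensions above can be captured in a single record per evaluation run so that results stay comparable across runs. A minimal sketch follows; the field names are illustrative and not part of any Azure AI Evaluation API.

from dataclasses import dataclass, field

@dataclass
class AgentEvaluationRecord:
    """Illustrative container for one evaluation run; adapt fields as needed."""
    agent_id: str
    # Performance assessment
    accuracy: float = 0.0
    avg_response_time: float = 0.0
    error_rate: float = 0.0
    # Behavioral analysis
    task_completion_rate: float = 0.0
    # User experience
    user_satisfaction: float = 0.0
    # Free-form notes (e.g. qualitative observations)
    notes: dict = field(default_factory=dict)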

Evaluation Methods

1. Quantitative Analysis

import os

from azure.ai.evaluation import EvaluationClient
from azure.identity import DefaultAzureCredential
import pandas as pd
import numpy as np

def perform_quantitative_analysis(agent_id: str, test_cases: list):
    """Perform quantitative analysis of agent performance."""
    try:
        # Initialize evaluation client
        credential = DefaultAzureCredential()
        client = EvaluationClient(
            subscription_id=os.getenv("AZURE_SUBSCRIPTION_ID"),
            resource_group=os.getenv("AZURE_RESOURCE_GROUP"),
            credential=credential
        )

        # Collect performance metrics
        metrics = []
        for test in test_cases:
            result = client.evaluate_agent(
                agent_id=agent_id,
                input_data=test["input"],
                expected_output=test["expected"],
                metrics=["response_time", "accuracy", "resource_usage", "success"]  # "success" is needed for the error-rate calculation below
            )
            metrics.append(result)

        # Analyze results
        df = pd.DataFrame(metrics)
        analysis = {
            "avg_response_time": df["response_time"].mean(),
            "accuracy_rate": df["accuracy"].mean(),
            "avg_resource_usage": df["resource_usage"].mean(),
            "error_rate": 1 - df["success"].mean(),
            "p95_response_time": df["response_time"].quantile(0.95)
        }

        return analysis
    except Exception as e:
        print(f"Error in quantitative analysis: {str(e)}")
        raise
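
A minimal usage sketch for the quantitative analysis above, assuming each test case carries an "input" and an "expected" value as the loop expects; the agent ID and sample cases are placeholders.

test_cases = [
    {"input": "What is your return policy?", "expected": "policy_summary"},
    {"input": "Track my latest order", "expected": "order_status"}
]

analysis = perform_quantitative_analysis("my-agent-id", test_cases)
print(f"Accuracy: {analysis['accuracy_rate']:.2%}")
print(f"p95 response time: {analysis['p95_response_time']}")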

2. Qualitative Analysis

def perform_qualitative_analysis(agent_id: str, test_cases: list):
    """Perform qualitative analysis of agent responses."""
    try:
        # Initialize evaluation client
        credential = DefaultAzureCredential()
        client = EvaluationClient(
            subscription_id=os.getenv("AZURE_SUBSCRIPTION_ID"),
            resource_group=os.getenv("AZURE_RESOURCE_GROUP"),
            credential=credential
        )

        # Evaluate response quality
        quality_metrics = []
        for test in test_cases:
            result = client.evaluate_response_quality(
                agent_id=agent_id,
                input_data=test["input"],
                response=test["response"],
                evaluation_criteria={
                    "relevance": True,
                    "coherence": True,
                    "completeness": True,
                    "correctness": True
                }
            )
            quality_metrics.append(result)

        # Analyze quality results
        quality_analysis = {
            "avg_relevance": np.mean([m["relevance"] for m in quality_metrics]),
            "avg_coherence": np.mean([m["coherence"] for m in quality_metrics]),
            "avg_completeness": np.mean([m["completeness"] for m in quality_metrics]),
            "avg_correctness": np.mean([m["correctness"] for m in quality_metrics])
        }

        return quality_analysis
    except Exception as e:
        print(f"Error in qualitative analysis: {str(e)}")
        raise
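
Usage differs slightly from the quantitative case: each test case here carries the agent's actual response to be judged, not an expected output. A sketch with placeholder data:

qualitative_cases = [
    {
        "input": "Summarize the incident report",
        "response": "The outage lasted 40 minutes and was caused by a config change."
    }
]

quality = perform_qualitative_analysis("my-agent-id", qualitative_cases)
for criterion, score in quality.items():
    print(f"{criterion}: {score:.2f}")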

3. Comparative Analysis

def perform_comparative_analysis(agent_ids: list, benchmark_data: dict):
    """Compare multiple agent versions or implementations."""
    try:
        # Initialize evaluation client
        credential = DefaultAzureCredential()
        client = EvaluationClient(
            subscription_id=os.getenv("AZURE_SUBSCRIPTION_ID"),
            resource_group=os.getenv("AZURE_RESOURCE_GROUP"),
            credential=credential
        )

        # Compare agents
        comparison_results = []
        for agent_id in agent_ids:
            # Evaluate against benchmark
            result = client.evaluate_agent_benchmark(
                agent_id=agent_id,
                benchmark_data=benchmark_data,
                metrics=["accuracy", "response_time", "resource_usage"]
            )

            # Calculate performance score
            performance_score = calculate_performance_score(result)
            comparison_results.append({
                "agent_id": agent_id,
                "performance_score": performance_score,
                "metrics": result
            })

        return comparison_results
    except Exception as e:
        print(f"Error in comparative analysis: {str(e)}")
        raise

def calculate_performance_score(result: dict) -> float:
    """Calculate an overall performance score from multiple metrics.

    Assumes response_time and resource_usage are normalized to a scale
    comparable with accuracy, so their inverses can be weighted together.
    """
    weights = {
        "accuracy": 0.4,
        "response_time": 0.3,
        "resource_usage": 0.3
    }

    # Higher accuracy raises the score; lower latency and lower resource
    # usage raise it through the inverse terms.
    score = (
        result["accuracy"] * weights["accuracy"] +
        (1 / result["response_time"]) * weights["response_time"] +
        (1 / result["resource_usage"]) * weights["resource_usage"]
    )

    return score
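
A usage sketch for the comparison, assuming benchmark_data takes whatever shape evaluate_agent_benchmark expects (the dataset name below is a placeholder). Sorting by the weighted score gives a simple ranking:

results = perform_comparative_analysis(
    agent_ids=["agent-v1", "agent-v2"],
    benchmark_data={"dataset": "support-benchmark"}  # placeholder shape
)

# Rank candidates by the weighted performance score (highest first)
ranked = sorted(results, key=lambda r: r["performance_score"], reverse=True)
for entry in ranked:
    print(f"{entry['agent_id']}: {entry['performance_score']:.3f}")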

Evaluation Scenarios

1. Functional Testing

def perform_functional_testing(agent_id: str):
    """Execute functional tests for core agent capabilities."""
    try:
        # Initialize evaluation client
        credential = DefaultAzureCredential()
        client = EvaluationClient(
            subscription_id=os.getenv("AZURE_SUBSCRIPTION_ID"),
            resource_group=os.getenv("AZURE_RESOURCE_GROUP"),
            credential=credential
        )

        # Define test cases
        test_cases = [
            {
                "name": "Basic Response",
                "input": "Hello, how are you?",
                "expected": "greeting"
            },
            {
                "name": "Task Completion",
                "input": "Schedule a meeting for tomorrow",
                "expected": "task_scheduling"
            },
            {
                "name": "Error Handling",
                "input": "Invalid request format",
                "expected": "error_response"
            }
        ]

        # Execute tests
        results = []
        for test in test_cases:
            result = client.test_agent_function(
                agent_id=agent_id,
                test_case=test,
                validation_rules={
                    "response_format": True,
                    "error_handling": True,
                    "completion_check": True
                }
            )
            results.append(result)

        return results
    except Exception as e:
        print(f"Error in functional testing: {str(e)}")
        raise
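
A brief usage sketch; the result schema is not shown above, so the "passed" field below is an assumption to adapt to whatever test_agent_function actually returns.

results = perform_functional_testing("my-agent-id")

# "passed" is an assumed field name; adjust to the real result schema.
passed = sum(1 for r in results if r.get("passed"))
print(f"{passed}/{len(results)} functional tests passed")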

2. Load Testing

async def perform_load_testing(agent_id: str, concurrent_users: int):
    """Execute load tests to assess agent performance under stress."""
    try:
        # Initialize evaluation client
        credential = DefaultAzureCredential()
        client = EvaluationClient(
            subscription_id=os.getenv("AZURE_SUBSCRIPTION_ID"),
            resource_group=os.getenv("AZURE_RESOURCE_GROUP"),
            credential=credential
        )

        # Configure load test
        load_config = {
            "concurrent_users": concurrent_users,
            "duration_seconds": 300,
            "ramp_up_seconds": 60,
            "test_scenarios": [
                {
                    "weight": 0.7,
                    "input": "Standard request",
                },
                {
                    "weight": 0.3,
                    "input": "Complex request",
                }
            ]
        }

        # Execute load test
        result = await client.execute_load_test(
            agent_id=agent_id,
            load_config=load_config,
            metrics=["response_time", "error_rate", "throughput"]
        )

        return result
    except Exception as e:
        print(f"Error in load testing: {str(e)}")
        raise
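
Since the load test is asynchronous, it needs to run inside an event loop. A minimal driver, with an arbitrary example user count:

import asyncio

async def main():
    result = await perform_load_testing("my-agent-id", concurrent_users=50)
    print(result)

if __name__ == "__main__":
    asyncio.run(main())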

3. Endurance Testing

async def perform_endurance_testing(agent_id: str, duration_hours: int):
    """Execute long-running tests to assess agent stability."""
    try:
        # Initialize evaluation client
        credential = DefaultAzureCredential()
        client = EvaluationClient(
            subscription_id=os.getenv("AZURE_SUBSCRIPTION_ID"),
            resource_group=os.getenv("AZURE_RESOURCE_GROUP"),
            credential=credential
        )

        # Configure endurance test
        test_config = {
            "duration_hours": duration_hours,
            "monitoring_interval_minutes": 5,
            "test_scenarios": [
                "basic_interaction",
                "complex_task",
                "error_scenario"
            ],
            "metrics": [
                "memory_usage",
                "response_time",
                "error_rate",
                "resource_utilization"
            ]
        }

        # Execute endurance test
        result = await client.execute_endurance_test(
            agent_id=agent_id,
            test_config=test_config
        )

        # Analyze stability metrics
        stability_analysis = analyze_stability_metrics(result)

        return stability_analysis
    except Exception as e:
        print(f"Error in endurance testing: {str(e)}")
        raise

def analyze_stability_metrics(result: dict) -> dict:
    """Analyze stability metrics from endurance test results."""
    return {
        "memory_leak_detected": check_memory_growth(result["memory_usage"]),
        "performance_degradation": calculate_degradation(result["response_time"]),
        "error_accumulation": analyze_error_pattern(result["error_rate"]),
        "resource_efficiency": calculate_resource_efficiency(result["resource_utilization"])
    }
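
The helpers referenced above (check_memory_growth, calculate_degradation, analyze_error_pattern, calculate_resource_efficiency) are left undefined in this workshop. As one assumption-laden sketch, check_memory_growth could fit a linear trend to the sampled memory readings and flag sustained growth:

import numpy as np

def check_memory_growth(memory_samples: list, slope_threshold: float = 0.01) -> bool:
    """Hypothetical helper: flag a possible memory leak when usage trends upward.

    Fits a straight line to the sampled readings and reports True when the
    normalized slope exceeds the threshold. The threshold is an assumption,
    not a value prescribed by Azure AI Evaluation.
    """
    samples = np.asarray(memory_samples, dtype=float)
    if samples.size < 2 or samples.mean() == 0:
        return False
    x = np.arange(samples.size)
    slope = np.polyfit(x, samples, 1)[0]
    # Normalize by the mean so the threshold is scale-independent
    return bool((slope / samples.mean()) > slope_threshold)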

Analysis and Reporting

1. Data Collection

  • Metric gathering
  • Log analysis
  • User feedback
  • System metrics
  • Cost data
  • Performance data

2. Analysis Methods

  • Statistical analysis
  • Pattern recognition
  • Trend analysis
  • Anomaly detection (see the sketch after this list)
  • Root cause analysis
  • Impact assessment
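
For example, anomaly detection over collected response times can start as a simple z-score check; the sketch below is illustrative and assumes the samples are roughly normally distributed.

import numpy as np

def detect_latency_anomalies(response_times: list, z_threshold: float = 3.0) -> list:
    """Return indices of response-time samples more than z_threshold
    standard deviations from the mean (a simple, illustrative heuristic)."""
    samples = np.asarray(response_times, dtype=float)
    if samples.size == 0 or samples.std() == 0:
        return []
    z_scores = np.abs((samples - samples.mean()) / samples.std())
    return np.flatnonzero(z_scores > z_threshold).tolist()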

3. Report Generation

  • Performance reports (see the sketch after this list)
  • Quality metrics
  • Resource usage
  • Cost analysis
  • Improvement recommendations
  • Action plans
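
A report can be as simple as flattening the analysis dictionaries produced earlier into a readable summary; the sketch below is one minimal, illustrative approach.

def format_report(title: str, analysis: dict) -> str:
    """Render an analysis dictionary as a plain-text report section."""
    lines = [title, "-" * len(title)]
    for metric, value in analysis.items():
        lines.append(f"{metric}: {value:.3f}" if isinstance(value, float) else f"{metric}: {value}")
    return "\n".join(lines)

# Example: reuse the quantitative analysis results from earlier
# print(format_report("Quantitative Analysis", analysis))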

Continuous Improvement

1. Performance Optimization

  • Resource tuning
  • Response optimization
  • Error reduction
  • Cost efficiency
  • Quality improvement
  • Process refinement

2. Feature Enhancement

  • Capability expansion
  • Integration improvement
  • Security enhancement
  • User experience
  • Documentation updates
  • Training materials

3. Process Improvement

  • Workflow optimization
  • Tool enhancement
  • Documentation updates
  • Team training
  • Knowledge sharing
  • Best practices

Next: Workshop Conclusion