# Install the deepeval evaluation framework (shell command, not Python).
pip install deepeval
from deepeval.metrics.ragas import (
RAGASContextualPrecisionMetric,
RAGASFaithfulnessMetric,
RAGASContextualRecallMetric,
RAGASAnswerRelevancyMetric,
)
# Instantiate the four RAGAS metrics with their default settings.
# NOTE(review): each presumably scores one aspect of a RAG pipeline's
# retrieval/generation quality — confirm against the deepeval docs.
faithfulness = RAGASFaithfulnessMetric()
answer_relevancy = RAGASAnswerRelevancyMetric()
contextual_recall = RAGASContextualRecallMetric()
contextual_precision = RAGASContextualPrecisionMetric()
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
# Custom G-Eval metric judging only the model's actual output against
# the bias criteria string below.
bias = GEval(
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    criteria="Coherence - determine if the actual output has an inherent bias against Asian culture.",
    name="Bias",
)
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
# Placeholder test case — fill in real values from your own application.
test_case = LLMTestCase(
    input="",
    expected_output="",
    actual_output="",
    retrieval_context=[""],
)
# Score the single test case against every metric defined above.
all_metrics = [
    contextual_precision,
    contextual_recall,
    answer_relevancy,
    faithfulness,
    bias,
]
evaluate(test_cases=[test_case], metrics=all_metrics)
# One-time project setup (shell commands):
pip install deepeval
# Optional: log in so evaluation results are uploaded to Confident AI.
deepeval login
# Create the pytest file that will hold the RAG test cases.
touch test_rag.py
from deepeval.metrics.ragas import (
RAGASContextualPrecisionMetric,
RAGASFaithfulnessMetric,
RAGASContextualRecallMetric,
RAGASAnswerRelevancyMetric,
)
from deepeval.metrics import BiasMetric
# Every metric below passed the same hard-coded 0.5; hoist it into one
# named constant so the passing bar is changed in a single place.
PASSING_THRESHOLD = 0.5

# Custom bias metric plus the four RAGAS metrics, each failing a test
# case whose score falls below the threshold.
bias = BiasMetric(threshold=PASSING_THRESHOLD)
contextual_precision = RAGASContextualPrecisionMetric(threshold=PASSING_THRESHOLD)
contextual_recall = RAGASContextualRecallMetric(threshold=PASSING_THRESHOLD)
answer_relevancy = RAGASAnswerRelevancyMetric(threshold=PASSING_THRESHOLD)
faithfulness = RAGASFaithfulnessMetric(threshold=PASSING_THRESHOLD)
...
# Replace these placeholder pairs with data from your own application.
input_output_pairs = [
    {"input": "...", "expected_output": "..."},
    {"input": "...", "expected_output": "..."},
]
import pytest
from deepeval import assert_test
from deepeval.metrics.ragas import (
RAGASContextualPrecisionMetric,
RAGASFaithfulnessMetric,
RAGASContextualRecallMetric,
RAGASAnswerRelevancyMetric,
)
from deepeval.metrics import BiasMetric
from deepeval.test_case import LLMTestCase
#######################################
# Initialize metrics with thresholds ##
#######################################
# The same hard-coded 0.5 appeared five times; hoist it into one named
# constant so the passing bar is changed in a single place.
PASSING_THRESHOLD = 0.5

# Custom bias metric plus the four RAGAS metrics, each failing a test
# case whose score falls below the threshold.
bias = BiasMetric(threshold=PASSING_THRESHOLD)
contextual_precision = RAGASContextualPrecisionMetric(threshold=PASSING_THRESHOLD)
contextual_recall = RAGASContextualRecallMetric(threshold=PASSING_THRESHOLD)
answer_relevancy = RAGASAnswerRelevancyMetric(threshold=PASSING_THRESHOLD)
faithfulness = RAGASFaithfulnessMetric(threshold=PASSING_THRESHOLD)
#######################################
# Specify evaluation metrics to use ###
#######################################
# All metrics applied to every test case (order preserved for reporting).
evaluation_metrics = [bias, contextual_precision, contextual_recall, answer_relevancy, faithfulness]
#######################################
# Specify inputs to test RAG app on ###
#######################################
# Inputs (and the answers we expect) that the RAG app is tested on;
# fill in real values before running.
input_output_pairs = [
    {"input": "", "expected_output": ""},
    {"input": "", "expected_output": ""},
]
#######################################
# Loop through input output pairs #####
#######################################
@pytest.mark.parametrize(
    "input_output_pair",
    input_output_pairs,
)
def test_llamaindex(input_output_pair: dict):
    """Run the RAG app on one input and assert every metric passes.

    Parametrized so pytest reports one result per input/expected-output
    pair. Fails (via ``assert_test``) if any metric scores below its
    threshold.
    """
    # Fix: the original annotated this parameter as `Dict`, which was
    # never imported from `typing` and would raise NameError when the
    # `def` statement executes; the builtin `dict` needs no import.
    # Also renamed the local `input` to avoid shadowing the builtin.
    query = input_output_pair.get("input", None)
    expected_output = input_output_pair.get("expected_output", None)

    # Hypothetical RAG application for demonstration only — replace with
    # your own implementation. The idea is to generate the LLM output and
    # capture the retrieval context at evaluation time for each input.
    actual_output = rag_application.query(query)
    retrieval_context = rag_application.get_retrieval_context()

    test_case = LLMTestCase(
        input=query,
        actual_output=actual_output,
        retrieval_context=retrieval_context,
        expected_output=expected_output,
    )
    # Assert the test case against all configured metrics.
    assert_test(test_case, evaluation_metrics)
# Execute the evaluation suite through deepeval's pytest wrapper (shell).
deepeval test run test_rag.py
# GitHub Actions workflow: run the deepeval suite on every push.
# Fix: the indentation required by YAML was lost (e.g. `push:` must nest
# under `on:`, steps under `jobs.test.steps`), so the workflow would not
# parse; structure restored below. The stray `...` (a YAML document-end
# marker) is replaced with a comment.
name: RAG Deployment Evaluations
on:
  push:
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      # Some extra steps to setup and install dependencies go here
      # (checkout, Python setup, `poetry install`, ...).

      # Optional login: uploads results to the Confident AI dashboard.
      - name: Login to Confident
        env:
          CONFIDENT_API_KEY: ${{ secrets.CONFIDENT_API_KEY }}
        run: poetry run deepeval login --confident-api-key "$CONFIDENT_API_KEY"

      - name: Run deepeval tests
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: poetry run deepeval test run test_rag.py