from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

def eval_expected_words(
system_message,
question,
expected_words,
human_template="{question}",
# Language model used by the assistant
llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
# Output parser to parse assistant's response
output_parser=StrOutputParser()
):
"""
Evaluate if the assistant's response contains the expected words.
Parameters:
system_message (str): The system message sent to the assistant.
question (str): The user's question.
expected_words (list): List of words expected in the assistant's response.
human_template (str, optional): Template for human-like response formatting.
Defaults to "{question}".
llm (ChatOpenAI, optional): Language model used by the assistant.
Defaults to ChatOpenAI(model="gpt-3.5-turbo", temperature=0).
output_parser (OutputParser, optional): Output parser to parse assistant's response.
Defaults to StrOutputParser().
Raises:
AssertionError: If the expected words are not found in the assistant's response.
"""
# Create an assistant chain with provided parameters
assistant = assistant_chain(
system_message,
human_template,
llm,
output_parser
)
# Invoke the assistant with the user's question
answer = assistant.invoke({"question": question})
# Print the assistant's response
print(answer)
try:
# Check if any of the expected words are present in the assistant's response
        assert any(word in answer.lower() for word in expected_words), \
            f"Expected the assistant's response to include one of {expected_words}, but it did not"
except Exception as e:
print(f"An error occured: {str(e)}")
def evaluate_refusal(
system_message,
question,
decline_response,
human_template="{question}",
llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
output_parser=StrOutputParser()):
"""
Evaluate if the assistant's response includes a refusal.
Parameters:
system_message (str): The system message sent to the assistant.
question (str): The user's question.
decline_response (str): The expected response from the assistant when refusing.
human_template (str, optional): Template for human-like response formatting.
Defaults to "{question}".
llm (ChatOpenAI, optional): Language model used by the assistant.
Defaults to ChatOpenAI(model="gpt-3.5-turbo", temperature=0).
output_parser (OutputParser, optional): Output parser to parse assistant's response.
Defaults to StrOutputParser().
Raises:
AssertionError: If the assistant's response does not contain the expected refusal.
"""
# Create an assistant chain with provided parameters
assistant = assistant_chain(
        system_message,
        human_template,
llm,
output_parser
)
# Invoke the assistant with the user's question
answer = assistant.invoke({"question": question})
# Print the assistant's response
print(answer)
try:
# Check if the expected refusal is present in the assistant's response
        assert decline_response.lower() in answer.lower(), \
            f"Expected the bot to decline with '{decline_response}', got '{answer}'"
except Exception as e:
return(f"An error occured: {str(e)}")
def create_eval_chain(
agent_response,
llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
output_parser=StrOutputParser()
):
"""
Creates an evaluation chain to assess the appropriateness of the agent's response.
Parameters:
agent_response (str): The response generated by the agent.
llm (ChatOpenAI, optional): Language model used for evaluation.
Defaults to ChatOpenAI(model="gpt-3.5-turbo", temperature=0).
output_parser (OutputParser, optional): Output parser for parsing agent's response.
Defaults to StrOutputParser().
Returns:
        Runnable: The evaluation chain (eval_prompt | llm | output_parser) used to grade the agent's response.
"""
delimiter = "####"
eval_system_prompt = f"""You are an assistant that evaluates whether or not an assistant is producing valid responses.
The assistant should be producing output in the format of [CUSTOM EVALUATION DEPENDING ON USE CASE]."""
eval_user_message = f"""You are evaluating [CUSTOM EVALUATION DEPENDING ON USE CASE].
Here is the data:
[BEGIN DATA]
************
[Response]: {agent_response}
************
[END DATA]
Read the response carefully and determine if it [MEETS REQUIREMENT FOR USE CASE]. Do not evaluate whether the information is correct; only evaluate whether the data is in the expected format.
Output 'True' if the response is appropriate, output 'False' if the response is not appropriate.
"""
eval_prompt = ChatPromptTemplate.from_messages([
("system", eval_system_prompt),
("human", eval_user_message),
])
return eval_prompt | llm | output_parser
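# A minimal sketch of exercising the model-graded eval chain directly (the sample
# agent response is an illustrative placeholder; in practice the bracketed
# [CUSTOM ...] sections of the prompts above should be filled in for your use case):
#
# eval_chain = create_eval_chain("Question 1: What is the capital of France?")
# print(eval_chain.invoke({}))  # expected to print "True" or "False"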
def model_grad_eval_format(generated_output):
"""
Evaluates the format of the generated output from the agent.
Parameters:
generated_output (str): The output generated by the agent.
Returns:
bool: True if the response is appropriate, False otherwise.
Raises:
ValueError: If the evaluation result is neither "True" nor "False".
"""
# Create an evaluation chain for assessing the agent's response format
eval_chain = create_eval_chain(generated_output)
# Retrieve the evaluation result from the evaluation chain
evaluation_results_str = eval_chain.invoke({})
# Strip leading/trailing whitespaces and convert to lowercase
retrieval_test_response = evaluation_results_str.strip().lower()
# Check if "true" or "false" is present in the response
    if "true" in retrieval_test_response:
        return True
    elif "false" in retrieval_test_response:
        return False
    else:
        # Neither "true" nor "false" was found in the grader's output
        raise ValueError(f"Unexpected evaluation result: {evaluation_results_str}")
model_grad_eval_format(generated_output="[UNDESIRED OUTPUT GENERATED BY AGENT]")
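# A sketch of how the grader might back an assertion in a test (the sample response
# and expected outcome are assumptions for illustration):
#
# assert model_grad_eval_format("Question 1: Which planet is known as the Red Planet?") is True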