#!pip install langchain openai weaviate-client ragas
OPENAI_API_KEY="<YOUR_OPENAI_API_KEY>"
import requests
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
url = "https://raw.githubusercontent.com/langchain-ai/langchain/master/docs/docs/modules/state_of_the_union.txt"
res = requests.get(url)
with open("state_of_the_union.txt", "w") as f:
f.write(res.text)
# Load the data
loader = TextLoader('./state_of_the_union.txt')
documents = loader.load()
# Chunk the data
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
import weaviate
from weaviate.embedded import EmbeddedOptions
from dotenv import load_dotenv,find_dotenv
# Load OpenAI API key from .env file
load_dotenv(find_dotenv())
# Setup vector database
client = weaviate.Client(
embedded_options = EmbeddedOptions()
)
# Populate vector database
vectorstore = Weaviate.from_documents(
client = client,
documents = chunks,
embedding = OpenAIEmbeddings(),
by_text = False
)
# Define vectorstore as retriever to enable semantic search
retriever = vectorstore.as_retriever()
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
# Define LLM
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
# Define prompt template
template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use two sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)
# Setup RAG pipeline
rag_chain = (
{"context": retriever, "question": RunnablePassthrough()}
| prompt
| llm
| StrOutputParser()
)
from datasets import Dataset
questions = ["What did the president say about Justice Breyer?",
"What did the president say about Intel's CEO?",
"What did the president say about gun violence?",
]
ground_truths = [["The president said that Justice Breyer has dedicated his life to serve the country and thanked him for his service."],
["The president said that Pat Gelsinger is ready to increase Intel's investment to $100 billion."],
["The president asked Congress to pass proven measures to reduce gun violence."]]
answers = []
contexts = []
# Inference
for query in questions:
answers.append(rag_chain.invoke(query))
contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])
# To dict
data = {
"question": questions,
"answer": answers,
"contexts": contexts,
"ground_truths": ground_truths
}
# Convert dict to dataset
dataset = Dataset.from_dict(data)
from ragas import evaluate
from ragas.metrics import (
faithfulness,
answer_relevancy,
context_recall,
context_precision,
)
result = evaluate(
dataset = dataset,
metrics=[
context_precision,
context_recall,
faithfulness,
answer_relevancy,
],
)
df = result.to_pandas()