RAG System Practical Guide: Building an Enterprise Knowledge Base from Scratch
RAG (Retrieval-Augmented Generation) is currently the most practical architecture for enterprises adopting LLMs: it lets the AI answer questions from internal enterprise knowledge while reducing hallucinations. This article provides a complete walkthrough, from principles to implementation, for building a production-grade RAG system.
What Is RAG?
RAG combines information retrieval with text generation:
User Question → Vector Search for Relevant Documents → Generate Answer with Document Context
Compared to fine-tuning, RAG offers the following advantages:
| Comparison | RAG | Fine-tuning |
|---|---|---|
| Knowledge Updates | Real-time | Requires retraining |
| Cost | Low | High |
| Explainability | Traceable sources | Black box |
| Hallucination Issues | Fewer | May worsen |
System Architecture
A complete RAG system consists of the following components:
┌─────────────────────────────────────────────────────────────┐
│                   RAG System Architecture                   │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  ┌─────────────────┐                                        │
│  │ Data Sources    │                                        │
│  │ PDF/Word/Web    │                                        │
│  └────────┬────────┘                                        │
│           │                                                 │
│           ▼                                                 │
│  ┌─────────────────┐     ┌─────────────────┐                │
│  │ Document        │────▶│ Embedding       │                │
│  │ Processing      │     │ Vectorization   │                │
│  │ Chunking        │     │                 │                │
│  └─────────────────┘     └────────┬────────┘                │
│                                   │                         │
│                                   ▼                         │
│                          ┌─────────────────┐                │
│                          │ Vector Database │                │
│                          │ Pinecone        │                │
│                          └────────┬────────┘                │
│                                   │                         │
│  ┌─────────────────┐              │                         │
│  │ User Question   │◀─────────────┘                         │
│  └────────┬────────┘  Similarity Search                     │
│           │                                                 │
│           ▼                                                 │
│  ┌─────────────────┐     ┌─────────────────┐                │
│  │ Prompt          │────▶│ LLM             │                │
│  │ Composition     │     │ Generate        │                │
│  └─────────────────┘     └─────────────────┘                │
│                                                             │
└─────────────────────────────────────────────────────────────┘
Document Processing
1. Document Parsing
Support for parsing documents in multiple formats:
from langchain_community.document_loaders import (
PyPDFLoader,
Docx2txtLoader,
UnstructuredHTMLLoader,
)
from pathlib import Path
class DocumentProcessor:
    """Load source files into document objects, picking a loader by extension."""

    def __init__(self):
        # Maps a lower-case file suffix to the loader class that parses it.
        self.loaders = {
            '.pdf': PyPDFLoader,
            '.docx': Docx2txtLoader,
            '.html': UnstructuredHTMLLoader,
        }

    def load_document(self, file_path: str) -> list:
        """Load a document and return a list of document objects.

        Args:
            file_path: Path of the file to load. Its suffix, compared
                case-insensitively, selects the loader.

        Returns:
            The documents produced by the loader. Each one has
            ``metadata['source']`` set to ``file_path`` and
            ``metadata['file_type']`` set to the lower-cased suffix.

        Raises:
            ValueError: If no loader is registered for the file's suffix.
        """
        suffix = Path(file_path).suffix.lower()
        loader_cls = self.loaders.get(suffix)
        if loader_cls is None:
            raise ValueError(f"Unsupported document format: {suffix}")

        documents = loader_cls(file_path).load()

        # Tag every document so retrieved chunks can be traced to their file.
        for doc in documents:
            doc.metadata['source'] = file_path
            doc.metadata['file_type'] = suffix
        return documents
2. Document Chunking
Chunking strategy directly impacts retrieval quality:
from langchain.text_splitter import RecursiveCharacterTextSplitter
class ChunkingStrategy:
    """Document chunking strategies."""

    # Default split points handed to the recursive splitter, listed from the
    # coarsest boundary (blank line) down to the finest (single character).
    DEFAULT_SEPARATORS = ("\n\n", "\n", "。", ",", " ", "")

    @staticmethod
    def recursive_split(documents: list, chunk_size: int = 1000,
                        chunk_overlap: int = 200, separators=None) -> list:
        """Recursive character splitting - general-purpose strategy.

        Args:
            documents: Documents to split.
            chunk_size: Target maximum size of each chunk.
            chunk_overlap: Amount of text shared between neighbouring chunks.
            separators: Optional sequence of separator strings. When omitted,
                ``DEFAULT_SEPARATORS`` is used.

        Returns:
            The chunks produced by the splitter's ``split_documents``.
        """
        if separators is None:
            separators = ChunkingStrategy.DEFAULT_SEPARATORS
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=list(separators),
        )
        return splitter.split_documents(documents)

    @staticmethod
    def semantic_split(documents: list, embeddings,
                       breakpoint_threshold_type: str = "percentile",
                       breakpoint_threshold_amount: float = 95) -> list:
        """Semantic splitting - based on content similarity.

        Args:
            documents: Documents to split.
            embeddings: Embedding model handed to ``SemanticChunker``.
            breakpoint_threshold_type: Forwarded to ``SemanticChunker``.
            breakpoint_threshold_amount: Forwarded to ``SemanticChunker``.

        Returns:
            The chunks produced by the chunker's ``split_documents``.
        """
        # Imported lazily so the experimental package is only required when
        # this strategy is actually used.
        from langchain_experimental.text_splitter import SemanticChunker

        splitter = SemanticChunker(
            embeddings,
            breakpoint_threshold_type=breakpoint_threshold_type,
            breakpoint_threshold_amount=breakpoint_threshold_amount,
        )
        return splitter.split_documents(documents)
Chunking Parameter Selection
| Document Type | chunk_size | chunk_overlap | Description |
|---|---|---|---|
| Technical Docs | 1000-1500 | 200-300 | Preserve complete paragraphs |
| Legal Contracts | 500-800 | 100-150 | Precise retrieval |
| Customer FAQ | 300-500 | 50-100 | Q&A pairing |
| Long Reports | 1500-2000 | 300-400 | Preserve context |
Vectorization (Embedding)
Choosing an Embedding Model
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
# Hosted option: OpenAI embeddings (high accuracy, incurs API cost).
openai_embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",  # alternatively text-embedding-3-large
)

# Local option: free to run and suitable for sensitive data.
local_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    model_kwargs={'device': 'cuda'},  # run the model on the GPU
)
Embedding Model Comparison
| Model | Dimensions | Chinese Support | Cost |
|---|---|---|---|
| text-embedding-3-small | 1536 | Good | $0.02/1M tokens |
| text-embedding-3-large | 3072 | Good | $0.13/1M tokens |
| multilingual-e5-large | 1024 | Excellent | Free (local) |
| bge-large-zh | 1024 | Excellent | Free (local) |
Vector Database
Pinecone Implementation
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
class VectorStoreManager:
def __init__(self, api_key: str, index_name: str):
self.pc = Pinecone(api_key=api_key)
self.index_name = index_name
def create_index(self, dimension: int = 1536):
"""Create a vector index"""
if self.index_name not in self.pc.list_indexes().names():
self.pc.create_index(
name=self.index_name,
dimension=dimension,
metric="cosine",
spec=ServerlessSpec(
cloud="aws",
region="us-east-1"
)
)
def get_vectorstore(self, embeddings) -> PineconeVectorStore:
"""Get vector store instance"""
return PineconeVectorStore(
index=self.pc.Index(self.index_name),
embedding=embeddings,
text_key="text",
)
def add_documents(self, vectorstore, documents: list):
"""Add documents to the vector store"""
vectorstore.add_documents(documents)
Local Alternative
from langchain_community.vectorstores import Chroma
# Chroma: suitable for development and small-scale deployments.
# NOTE(review): `chunks` and `embeddings` are not defined in this snippet;
# presumably they come from the chunking and embedding steps - confirm.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./chroma_db",
)
Retrieval Strategies
1. Basic Similarity Search
def basic_retrieval(vectorstore, query: str, k: int = 5):
    """Basic vector similarity search.

    Args:
        vectorstore: Store exposing ``similarity_search(query, k=...)``.
        query: Text to search for.
        k: Number of results requested from the store.

    Returns:
        Whatever the store's ``similarity_search`` returns.
    """
    return vectorstore.similarity_search(query, k=k)
2. Hybrid Search
Combining vector search with keyword search:
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
def hybrid_retrieval(documents: list, vectorstore, query: str, k: int = 5,
                     weights=(0.4, 0.6)):
    """Hybrid retrieval: Vector + BM25.

    Args:
        documents: Documents used to build the BM25 keyword retriever.
        vectorstore: Vector store providing the semantic retriever.
        query: Text to search for.
        k: Number of results each underlying retriever returns.
        weights: ``(bm25_weight, vector_weight)``; defaults to 40% / 60%.

    Returns:
        The result of invoking the ensemble retriever with ``query``.
    """
    # BM25 keyword search. The retriever is rebuilt from `documents` on every
    # call, so callers with a large corpus may prefer to build it once.
    bm25_retriever = BM25Retriever.from_documents(documents)
    bm25_retriever.k = k

    # Vector search
    vector_retriever = vectorstore.as_retriever(search_kwargs={"k": k})

    # The order of `weights` must match the order of `retrievers`.
    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, vector_retriever],
        weights=list(weights),
    )
    return ensemble_retriever.invoke(query)
3. Reranking
Using a cross-encoder to rerank retrieval results:
from langchain.retrievers import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
def reranking_retrieval(vectorstore, query: str, fetch_k: int = 20,
                        top_n: int = 5,
                        model: str = "rerank-multilingual-v3.0"):
    """Reranking with Cohere Rerank.

    Args:
        vectorstore: Vector store providing the first-stage retriever.
        query: Text to search for.
        fetch_k: Number of candidates fetched before reranking.
        top_n: Number of results kept after reranking.
        model: Cohere rerank model name.

    Returns:
        The result of invoking the compression retriever with ``query``.
    """
    # Over-fetch candidates so the reranker has a pool to choose from.
    base_retriever = vectorstore.as_retriever(search_kwargs={"k": fetch_k})
    reranker = CohereRerank(model=model, top_n=top_n)
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=reranker,
        base_retriever=base_retriever,
    )
    return compression_retriever.invoke(query)
Answer Generation
Prompt Design
from langchain_core.prompts import ChatPromptTemplate
# Prompt template for answer generation. It has two placeholders: {context}
# (the reference materials) and {question} (the user's question).
RAG_PROMPT = ChatPromptTemplate.from_template("""
You are a professional enterprise knowledge base assistant. Please answer the user's question based on the provided materials.
Rules:
1. Only answer based on the provided materials; do not fabricate information
2. If the materials do not contain relevant content, clearly state so
3. Answers should be concise and professional; use bullet points when necessary
4. Cite the data sources at the end of your answer
Reference Materials:
{context}
User Question: {question}
Please answer:
""")
Complete RAG Chain
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
def create_rag_chain(vectorstore, model_name: str = "gpt-4"):
    """Build a RAG chain.

    The chain takes the question text as input, retrieves 5 documents,
    formats them into the prompt context, calls the chat model with
    temperature 0 and parses the reply to a string.

    Args:
        vectorstore: Vector store used to create the retriever.
        model_name: Chat model name passed to ``ChatOpenAI``.

    Returns:
        The composed runnable chain.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    llm = ChatOpenAI(model=model_name, temperature=0)

    def format_docs(docs):
        # Prefix each chunk with its source; the prompt asks the model to
        # cite its data sources.
        return "\n\n".join(
            f"[Source: {doc.metadata.get('source', 'Unknown')}]\n{doc.page_content}"
            for doc in docs
        )

    rag_chain = (
        {
            "context": retriever | format_docs,
            "question": RunnablePassthrough(),
        }
        | RAG_PROMPT
        | llm
        | StrOutputParser()
    )
    return rag_chain
# Usage: build the chain from a vector store, then pass the question text
# directly to invoke(); the chain's output is bound to `answer`.
chain = create_rag_chain(vectorstore)
answer = chain.invoke("What is the company's leave policy?")
Advanced Optimization
1. Query Rewriting
# Prompt template for query rewriting. Its single placeholder, {question},
# receives the user's original question.
REWRITE_PROMPT = ChatPromptTemplate.from_template("""
Rewrite the following user question into a form more suitable for retrieval.
Preserve the original intent but use more precise keywords.
Original question: {question}
Rewritten question:
""")
def query_rewrite(llm, question: str) -> str:
    """Rewrite a user question into a retrieval-friendly form using the LLM.

    Args:
        llm: Chat model piped after ``REWRITE_PROMPT``.
        question: The user's original question.

    Returns:
        The rewritten question with surrounding whitespace removed.
    """
    chain = REWRITE_PROMPT | llm | StrOutputParser()
    # Strip stray leading/trailing whitespace from the model's reply so it
    # does not end up in the search query.
    return chain.invoke({"question": question}).strip()
2. Multi-Query Retrieval
from langchain.retrievers.multi_query import MultiQueryRetriever
def multi_query_retrieval(vectorstore, llm, question: str):
    """Retrieve from multiple perspectives.

    Args:
        vectorstore: Vector store providing the base retriever.
        llm: Language model handed to ``MultiQueryRetriever.from_llm``.
        question: The user's question.

    Returns:
        The result of invoking the multi-query retriever with ``question``.
    """
    retriever = MultiQueryRetriever.from_llm(
        retriever=vectorstore.as_retriever(),
        llm=llm,
    )
    return retriever.invoke(question)
3. Source Citation
from langchain_core.runnables import RunnableParallel
def create_chain_with_sources(vectorstore, llm):
    """Build a chain with source citations.

    The chain takes the question text as input and returns a dict with two
    keys: ``answer`` (the model's reply as a string) and ``sources`` (the
    ``source`` metadata of each retrieved document, in retrieval order).

    Documents are retrieved once and shared by both outputs, so the listed
    sources are the documents the answer was generated from.

    Args:
        vectorstore: Vector store used to create the retriever (k=5).
        llm: Chat model that generates the answer.

    Returns:
        The composed runnable chain.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

    def format_context(docs):
        # Number the chunks; the prompt's {context} slot must receive plain
        # text, not a dict.
        return "\n\n".join(
            f"[{position}] {doc.page_content}"
            for position, doc in enumerate(docs, start=1)
        )

    def list_sources(docs):
        return [doc.metadata.get('source') for doc in docs]

    # Step 1: retrieve once, keeping the documents next to the question.
    retrieve = RunnableParallel(docs=retriever, question=RunnablePassthrough())

    # Step 2: derive the answer and the source list from the same documents.
    answer = (
        {
            "context": lambda state: format_context(state["docs"]),
            "question": lambda state: state["question"],
        }
        | RAG_PROMPT
        | llm
        | StrOutputParser()
    )
    chain = retrieve | RunnableParallel(
        answer=answer,
        sources=lambda state: list_sources(state["docs"]),
    )
    return chain
Evaluation and Monitoring
Retrieval Quality Evaluation
def evaluate_retrieval(vectorstore, test_queries: list, k: int = 5):
    """Evaluate retrieval quality with hit rate (Hit@k) and MRR.

    Args:
        vectorstore: Store exposing ``similarity_search(query, k=...)`` whose
            results carry a ``metadata`` dict with a ``source`` entry.
        test_queries: Items with a ``'query'`` key and an
            ``'expected_source'`` key.
        k: Number of results retrieved per query.

    Returns:
        ``{'hit_rate': ..., 'mrr': ...}``. ``hit_rate`` is the fraction of
        queries whose expected source appears in the top ``k``. ``mrr`` is
        the mean of ``1 / rank`` over all queries, counting misses as 0.
        Both are 0.0 when ``test_queries`` is empty.
    """
    # An empty test set would otherwise divide by zero below.
    if not test_queries:
        return {'hit_rate': 0.0, 'mrr': 0.0}

    hits = 0
    reciprocal_rank_sum = 0.0
    for query_data in test_queries:
        expected_doc = query_data['expected_source']
        retrieved = vectorstore.similarity_search(query_data['query'], k=k)
        retrieved_sources = [d.metadata.get('source') for d in retrieved]

        if expected_doc in retrieved_sources:
            hits += 1
            # Rank is 1-based; index() gives the first (best) match.
            reciprocal_rank_sum += 1 / (retrieved_sources.index(expected_doc) + 1)

    total = len(test_queries)
    return {'hit_rate': hits / total, 'mrr': reciprocal_rank_sum / total}
Conclusion
Building a production-grade RAG system requires attention to:
- Document Processing: Choosing the appropriate chunking strategy
- Embedding: Selecting models based on language and cost requirements
- Retrieval Strategy: Hybrid search + reranking to improve accuracy
- Prompt Design: Guiding the model to properly use retrieval results
- Continuous Optimization: Establishing evaluation mechanisms for iterative improvement
If you are planning an enterprise knowledge base or AI assistant project, feel free to contact us for a discussion.