You've installed Ollama and Gemma 4 from our previous guide. Now what? A chatbot is fun for five minutes, but real AI applications need access to your data (RAG), the ability to call tools (function calling), and autonomous reasoning (agents). This guide shows you how to build all three — entirely offline, with no cloud APIs and no data leaving your machine.
What We're Building
Part 1: RAG (Retrieval-Augmented Generation)
Problem: LLMs only know what they were trained on. They don't know your company docs, your codebase, or your private data.
Solution: RAG retrieves relevant documents from your data, adds them to the LLM's prompt as context, and the LLM answers based on your data — not just its training data.
Build a Complete RAG System
# pip install langchain langchain-community chromadb ollama sentence-transformers
# ── Step 1: Load your documents ────────────────
from langchain_community.document_loaders import (
DirectoryLoader, PyPDFLoader, TextLoader, CSVLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Recursively pick up every *.pdf below ./company-docs/ and parse each one
# with PyPDFLoader.
loader = DirectoryLoader(
    "./company-docs/",
    loader_cls=PyPDFLoader,
    glob="**/*.pdf",
)
documents = loader.load()
print(f"Loaded {len(documents)} pages from PDFs")

# Other sources can be loaded the same way, for example:
#   loader = TextLoader("./notes.md")
#   loader = CSVLoader("./data.csv")

# ── Step 2: Split into chunks ─────────────────
# Chunks are at most 1000 characters and share 200 characters with their
# neighbour so context carries across chunk boundaries.  The separators are
# listed from paragraph breaks down to the empty string.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = splitter.split_documents(documents)
print(f"Split into {len(chunks)} chunks")
# ── Step 3: Create embeddings + vector store ──
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
# Embeddings are produced through Ollama as well, so this step stays local.
# model="gemma4:12b" would also work here (it uses the LLM itself for
# embeddings).
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Embed every chunk into a ChromaDB collection; persist_directory makes the
# store live on disk under ./chroma_db.
vectorstore = Chroma.from_documents(
    persist_directory="./chroma_db",
    embedding=embeddings,
    documents=chunks,
)

# NOTE(review): _collection is a private attribute of the Chroma wrapper; it
# is read here only to report how many vectors were stored.
vector_count = vectorstore._collection.count()
print(f"Vector store created with {vector_count} vectors")
# ── Step 4: Create the RAG chain ──────────────
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
# Local generation model.
llm = Ollama(model="gemma4:12b", temperature=0.3)

# Prompt for the "stuff" chain: {context} receives the retrieved chunks and
# {question} the user's query.  The template text is part of the program's
# behaviour, so it is kept verbatim.
rag_template = """
Use the following context to answer the question. If the answer is not
in the context, say "I don't have enough information to answer that."
Context:
{context}
Question: {question}
Answer:"""
prompt = PromptTemplate.from_template(rag_template)

# Retrieve the 4 best-matching chunks per question and return them next to
# the answer so the sources can be shown.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)
# ── Step 5: Ask questions! ────────────────────
result = qa_chain.invoke({"query": "What was our Q3 revenue?"})

# The answer is printed before the sources are collected, so it is shown
# even if a retrieved document has no "source" metadata entry.
answer = result["result"]
print(f"Answer: {answer}")
sources = [doc.metadata["source"] for doc in result["source_documents"]]
print(f"Sources: {sources}")
# All running locally — PDFs never leave your machine!
Part 2: Function Calling (Tool Use)
Problem: LLMs can only generate text. They can't check the weather, query a database, or send emails.
Solution: Function calling lets the LLM decide which tool to use and what arguments to pass. Your code executes the tool and returns the result to the LLM.
# Function calling with Ollama + LangChain
from langchain_community.llms import Ollama
from langchain.agents import tool, AgentExecutor, create_react_agent
from langchain.prompts import PromptTemplate
import sqlite3
import datetime
# ── Define tools ───────────────────────────────
@tool
def query_database(sql: str) -> str:
    """Execute a read-only SQL query against the app database.
    Only SELECT queries are allowed."""
    # NOTE: this function is wrapped by LangChain's @tool, which uses the
    # docstring above as the tool description shown to the model, so the
    # docstring is kept short and maintainer notes live in comments.
    #
    # Returns str(list_of_row_tuples) on success, or a message starting with
    # "Error:" / "SQL Error:" on failure.  Failures are returned rather than
    # raised so the agent can read them as an observation and retry.

    # Cheap first-line filter; the read-only connection below is what
    # actually guarantees that nothing can be written.
    if not sql.strip().upper().startswith("SELECT"):
        return "Error: Only SELECT queries allowed"

    try:
        # mode=ro opens the file read-only at the SQLite level and, unlike a
        # plain connect("app.db"), does not create an empty database when the
        # file is missing.
        conn = sqlite3.connect("file:app.db?mode=ro", uri=True)
    except sqlite3.Error as e:
        return f"SQL Error: {e}"

    try:
        result = conn.execute(sql).fetchall()
        return str(result)
    except Exception as e:
        # Tool boundary: report any failure to the agent instead of crashing.
        return f"SQL Error: {e}"
    finally:
        conn.close()
@tool
def get_current_time() -> str:
    """Returns the current date and time."""
    # Naive local wall-clock time (no timezone attached), formatted as
    # "YYYY-MM-DD HH:MM:SS".
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@tool
def search_files(query: str) -> str:
    """Search for files containing a keyword in the project directory."""
    # Runs `grep -rl` over ./src/ and returns grep's stdout (one matching
    # path per line), "No files found" when nothing matches, or a message
    # starting with "Error:" when the search could not be carried out.
    # Failures are returned, not raised, so the agent can react to them.
    import subprocess

    # An empty pattern matches every line of every file, which is never a
    # useful answer for a keyword search.
    if not query:
        return "Error: empty search query"

    try:
        result = subprocess.run(
            # "--" ends option parsing, so a query that starts with "-" is
            # treated as the search pattern and not as a grep flag.
            ["grep", "-rl", "--", query, "./src/"],
            capture_output=True, text=True, timeout=10,
        )
    except subprocess.TimeoutExpired:
        return "Error: search timed out after 10 seconds"
    except OSError as e:
        # e.g. grep is not installed on this machine
        return f"Error: could not run grep: {e}"

    # grep exits with 0 on a match, 1 on no match and a higher status on
    # failure.  Matches found before a failure are still returned.
    if not result.stdout and result.returncode > 1:
        return f"Error: {result.stderr.strip() or 'grep failed'}"
    return result.stdout or "No files found"
# ── Create the agent ───────────────────────────
# temperature=0 keeps the Thought/Action/Action Input format as stable as
# possible between runs.
llm = Ollama(model="gemma4:12b", temperature=0)
tools = [query_database, get_current_time, search_files]

# ReAct prompt.  {tools}, {tool_names} and {agent_scratchpad} are filled in
# by create_react_agent; {input} is the user's question.
prompt = PromptTemplate.from_template("""
You are a helpful assistant with access to tools.
Available tools: {tools}
Tool names: {tool_names}
Use the following format:
Question: the input question
Thought: think about what to do
Action: the tool to use
Action Input: input for the tool
Observation: the tool result
... (repeat if needed)
Thought: I now know the answer
Final Answer: the final answer
Question: {input}
{agent_scratchpad}""")

agent = create_react_agent(llm, tools, prompt)
executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
    # Without this, one reply that does not follow the ReAct format raises
    # an output-parsing error and aborts the run.  With it, the error is
    # passed back to the model as an observation so it can try again.
    handle_parsing_errors=True,
)

# ── Run it ─────────────────────────────────────
result = executor.invoke({
    "input": "How many users signed up today?"
})
# Agent thinks: "I need to query the database"
# Action: query_database
# Action Input: SELECT COUNT(*) FROM users WHERE DATE(created_at) = DATE('now')
# Observation: [(47,)]
# Final Answer: 47 users signed up today.
Part 3: Autonomous Agents
An agent is an LLM that can reason, plan, and use tools autonomously to complete complex, multi-step tasks.
# Multi-agent system with CrewAI (all local)
# pip install crewai crewai-tools
from crewai import Agent, Task, Crew
from langchain_community.llms import Ollama
llm = Ollama(model="gemma4:12b")

# ── Define specialized agents ──────────────────
# All three agents use the same local model and verbose logging; only the
# role, goal and backstory differ.
_agent_defaults = {"llm": llm, "verbose": True}

researcher = Agent(
    role="Research Analyst",
    goal="Find and analyze relevant information",
    backstory="Expert at researching topics and summarizing findings",
    **_agent_defaults,
)

writer = Agent(
    role="Technical Writer",
    goal="Write clear, engaging technical content",
    backstory="Expert at turning research into readable articles",
    **_agent_defaults,
)

reviewer = Agent(
    role="Editor",
    goal="Review content for accuracy and clarity",
    backstory="Experienced editor with attention to detail",
    **_agent_defaults,
)
# ── Define tasks ───────────────────────────────
# One task per agent.  `context` lists the earlier task whose output the
# task uses as input: research -> writing -> review.
research_task = Task(
    agent=researcher,
    description="Research the key differences between gRPC and REST APIs. Include performance data.",
    expected_output="A structured research document with key findings",
)

writing_task = Task(
    agent=writer,
    context=[research_task],
    description="Write a 500-word blog post based on the research. Make it beginner-friendly.",
    expected_output="A polished blog post in markdown format",
)

review_task = Task(
    agent=reviewer,
    context=[writing_task],
    description="Review the blog post for technical accuracy and clarity. Suggest improvements.",
    expected_output="Reviewed blog post with corrections applied",
)
# ── Run the crew ───────────────────────────────
# Bundle the three agents with their tasks, start the run and print what
# kickoff() returns.
crew_agents = [researcher, writer, reviewer]
crew_tasks = [research_task, writing_task, review_task]
crew = Crew(tasks=crew_tasks, agents=crew_agents, verbose=True)

result = crew.kickoff()
print(result)
# Three AI agents collaborate to research, write, and review a blog post
# All running locally on your machine!
Privacy & Security Benefits
Performance Tips
# 1. Use a dedicated embedding model (smaller, faster)
ollama pull nomic-embed-text # 274 MB, very fast embeddings
# Don't use gemma4:12b for embeddings — it's overkill
# 2. Persist your vector store
vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
# Second run loads instantly — no re-embedding
# 3. Tune chunk size for your data type
# Code: chunk_size=500, overlap=50 (functions are short)
# Legal docs: chunk_size=2000, overlap=400 (context matters)
# Chat logs: chunk_size=300, overlap=0 (each message is independent)
# 4. Use GPU for both embedding and generation
OLLAMA_NUM_GPU=99 ollama serve # Max GPU layers
# 5. Cache LLM responses for repeated queries
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache
set_llm_cache(SQLiteCache(database_path=".langchain.db"))
Local AI isn't just for chatbots anymore. With RAG, your LLM answers questions about your data. With function calling, it takes actions in the real world. With agents, it plans and executes multi-step workflows autonomously. All of this runs on your laptop, costs nothing after setup, and keeps your data completely private. The local AI stack is production-ready — start building.