llm-document-extraction
SKILL.md
LLM Document Extraction
Overview
Construction documents (RFIs, submittals, specs, contracts) contain critical data trapped in unstructured formats. This skill uses LLMs to extract structured data automatically.
"The construction industry is drowning in a flood of new data: the volume of information has grown from 15 zettabytes in 2015 to 181 zettabytes in 2025, and 90% of all existing data has been created in just the last few years." — Artem Boiko
Use Cases
| Document Type | Extract |
|---|---|
| RFI | Question, response, dates, parties |
| Submittal | Product specs, approval status, materials |
| Contract | Parties, amounts, dates, scope, clauses |
| Specification | Materials, standards, requirements |
| Daily Report | Weather, labor, equipment, progress |
Quick Start
from openai import OpenAI
import pdfplumber
import json

# Shared OpenAI client used by every helper in this skill.
# OpenAI() with no arguments picks up its configuration from the default
# environment-based settings — presumably an API key is set; verify in deployment.
client = OpenAI()
def extract_from_pdf(pdf_path: str, extraction_schema: dict) -> dict:
    """Extract structured data from a PDF using an LLM.

    Args:
        pdf_path: Path to the PDF document to process.
        extraction_schema: JSON-style mapping of field names to type/format
            hints; serialized into the prompt as the target schema.

    Returns:
        A dict parsed from the model's JSON response, shaped per the schema.

    Raises:
        json.JSONDecodeError: If the model response is not valid JSON
            (unlikely with ``response_format={"type": "json_object"}``).
    """
    # Extract text from the PDF. extract_text() returns None for pages with
    # no extractable text (e.g. scanned images), which would crash str.join —
    # substitute an empty string for those pages.
    with pdfplumber.open(pdf_path) as pdf:
        text = "\n".join(page.extract_text() or "" for page in pdf.pages)

    # Truncate the document to stay within context limits. The truncation is
    # done OUTSIDE the f-string so no stray comment text leaks into the prompt
    # (the original inlined "# Truncate..." inside the f-string, sending it
    # verbatim to the model).
    document_excerpt = text[:8000]

    # Build the extraction prompt around the serialized schema.
    prompt = f"""
Extract the following information from this construction document.
Return ONLY valid JSON matching the schema.
Schema:
{json.dumps(extraction_schema, indent=2)}
Document:
{document_excerpt}
JSON Output:
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a construction document analyst. Extract data accurately."},
            {"role": "user", "content": prompt}
        ],
        # Constrains the model to emit syntactically valid JSON.
        response_format={"type": "json_object"}
    )
    return json.loads(response.choices[0].message.content)
Extraction Schemas
RFI Schema
# Schema for Requests for Information (RFIs). Values are type/format hints
# serialized into the LLM prompt — they instruct the model what shape to
# return; nothing here is validated programmatically.
rfi_schema = {
    "rfi_number": "string",
    "date_submitted": "YYYY-MM-DD",
    "date_required": "YYYY-MM-DD",
    "from_company": "string",
    "to_company": "string",
    "subject": "string",
    "question": "string",
    "response": "string or null",   # null while the RFI is still unanswered
    "status": "open|closed|pending",
    "cost_impact": "boolean",
    "schedule_impact": "boolean",
    "attachments": ["list of attachment names"]
}
# Extract
rfi_data = extract_from_pdf("RFI-0042.pdf", rfi_schema)
Submittal Schema
# Schema for product submittals. Like the RFI schema, these values are
# prompt-side hints for the LLM, not enforced types.
submittal_schema = {
    "submittal_number": "string",
    "spec_section": "string",
    "description": "string",
    "manufacturer": "string",
    "product_name": "string",
    "model_number": "string",
    "submitted_by": "string",
    "date_submitted": "YYYY-MM-DD",
    # Standard submittal review outcomes.
    "status": "approved|approved_as_noted|revise_resubmit|rejected",
    "reviewer_comments": "string or null",
    # One entry per material listed in the submittal.
    "materials": [
        {
            "name": "string",
            "specification": "string",
            "quantity": "string"
        }
    ]
}
Contract Schema
# Schema for construction contracts. Nested objects describe the parties;
# values are prompt-side format hints for the LLM.
contract_schema = {
    "contract_number": "string",
    "project_name": "string",
    "owner": {
        "name": "string",
        "address": "string"
    },
    "contractor": {
        "name": "string",
        "address": "string"
    },
    "contract_amount": "number",
    "start_date": "YYYY-MM-DD",
    "completion_date": "YYYY-MM-DD",
    "liquidated_damages": "number per day",
    "retention_percentage": "number",
    # Summaries of notable clauses, one entry per clause.
    "key_clauses": [
        {
            "clause_number": "string",
            "title": "string",
            "summary": "string"
        }
    ]
}
Batch Processing with n8n
{
"workflow": "Document Extraction Pipeline",
"trigger": "Watch folder for new PDFs",
"nodes": [
{
"name": "Read PDF",
"type": "Read Binary Files"
},
{
"name": "Classify Document",
"type": "AI Agent",
"prompt": "Classify this document: RFI, Submittal, Contract, Spec, or Other"
},
{
"name": "Route by Type",
"type": "Switch",
"rules": ["RFI", "Submittal", "Contract", "Spec"]
},
{
"name": "Extract RFI",
"type": "OpenAI",
"schema": "rfi_schema"
},
{
"name": "Extract Submittal",
"type": "OpenAI",
"schema": "submittal_schema"
},
{
"name": "Save to Database",
"type": "PostgreSQL",
"operation": "insert"
},
{
"name": "Update Dashboard",
"type": "HTTP Request",
"method": "POST"
}
]
}
Vision Model for Drawings
import base64
import mimetypes

def extract_from_drawing(image_path: str, query: str) -> str:
    """Extract information from a drawing image using a vision model.

    Args:
        image_path: Path to the drawing image (PNG, JPEG, WebP, ...).
        query: Natural-language question to ask about the drawing.

    Returns:
        The model's text response.
    """
    # Guess the MIME type from the file extension rather than hard-coding
    # image/png, so non-PNG drawings are labeled correctly in the data URL.
    # Fall back to PNG (the original behavior) when the type is unknown.
    media_type = mimetypes.guess_type(image_path)[0] or "image/png"

    with open(image_path, "rb") as f:
        image_data = base64.standard_b64encode(f.read()).decode()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": query},
                    {
                        "type": "image_url",
                        "image_url": {
                            # Inline the image as a base64 data URL.
                            "url": f"data:{media_type};base64,{image_data}"
                        }
                    }
                ]
            }
        ]
    )
    return response.choices[0].message.content

# Example: Extract room areas from floor plan
areas = extract_from_drawing(
    "floor_plan.png",
    "List all rooms with their areas in square meters. Return as JSON."
)
# Example: Extract room areas from floor plan
areas = extract_from_drawing(
"floor_plan.png",
"List all rooms with their areas in square meters. Return as JSON."
)
RAG for Large Documents
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant
def create_document_index(pdf_path: str):
    """Build a searchable vector index over a (potentially large) PDF.

    Loads the document, splits it into 1000-character chunks with a
    200-character overlap, embeds each chunk with OpenAI embeddings, and
    stores the vectors in a Qdrant collection named "contract_docs".
    """
    # Load pages, then split into overlapping chunks for retrieval.
    pages = PyPDFLoader(pdf_path).load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    document_chunks = text_splitter.split_documents(pages)

    # Embed the chunks and persist them in the vector store.
    return Qdrant.from_documents(
        document_chunks,
        OpenAIEmbeddings(),
        collection_name="contract_docs",
    )
def query_document(vectorstore, question: str) -> str:
    """Answer *question* via retrieval-augmented generation over *vectorstore*.

    Retrieves the five most relevant chunks, concatenates them as context,
    and asks the LLM to answer strictly from that context.
    """
    relevant_chunks = vectorstore.as_retriever(search_kwargs={"k": 5}).invoke(question)
    context = "\n".join(chunk.page_content for chunk in relevant_chunks)

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Answer based on the contract excerpts provided."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"},
        ],
    )
    return completion.choices[0].message.content
# Usage: index the document once, then ask ad-hoc questions against it.
index = create_document_index("contract_100pages.pdf")
answer = query_document(index, "What are the liquidated damages terms?")
Requirements
pip install openai pdfplumber langchain langchain-openai qdrant-client
Resources
- OpenAI Vision: https://platform.openai.com/docs/guides/vision
- LangChain RAG: https://python.langchain.com/docs/tutorials/rag/
Weekly Installs
4
Repository
datadrivenconstruction
GitHub Stars
51
First Seen
9 days ago
Security Audits
Installed on
opencode: 4
gemini-cli: 4
antigravity: 4
github-copilot: 4
codex: 4
kimi-cli: 4