llm-document-extraction

SKILL.md

LLM Document Extraction

Overview

Construction documents (RFIs, submittals, specs, contracts) contain critical data trapped in unstructured formats. This skill uses LLMs to extract structured data automatically.

"The construction industry is drowning in a flood of new data: the volume of information has grown from 15 zettabytes in 2015 to 181 zettabytes in 2025, and 90% of all existing data has been created in just the last few years." — Artem Boiko

Use Cases

Document Type Extract
RFI Question, response, dates, parties
Submittal Product specs, approval status, materials
Contract Parties, amounts, dates, scope, clauses
Specification Materials, standards, requirements
Daily Report Weather, labor, equipment, progress

Quick Start

from openai import OpenAI
import pdfplumber
import json

client = OpenAI()

def extract_from_pdf(pdf_path: str, extraction_schema: dict) -> dict:
    """Extract structured data from PDF using LLM"""

    # Extract text from PDF
    with pdfplumber.open(pdf_path) as pdf:
        text = "\n".join(page.extract_text() for page in pdf.pages)

    # Build extraction prompt
    prompt = f"""
    Extract the following information from this construction document.
    Return ONLY valid JSON matching the schema.

    Schema:
    {json.dumps(extraction_schema, indent=2)}

    Document:
    {text[:8000]}  # Truncate for context limits

    JSON Output:
    """

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a construction document analyst. Extract data accurately."},
            {"role": "user", "content": prompt}
        ],
        response_format={"type": "json_object"}
    )

    return json.loads(response.choices[0].message.content)

Extraction Schemas

RFI Schema

rfi_schema = {
    "rfi_number": "string",
    "date_submitted": "YYYY-MM-DD",
    "date_required": "YYYY-MM-DD",
    "from_company": "string",
    "to_company": "string",
    "subject": "string",
    "question": "string",
    "response": "string or null",
    "status": "open|closed|pending",
    "cost_impact": "boolean",
    "schedule_impact": "boolean",
    "attachments": ["list of attachment names"]
}

# Extract
rfi_data = extract_from_pdf("RFI-0042.pdf", rfi_schema)

Submittal Schema

submittal_schema = {
    "submittal_number": "string",
    "spec_section": "string",
    "description": "string",
    "manufacturer": "string",
    "product_name": "string",
    "model_number": "string",
    "submitted_by": "string",
    "date_submitted": "YYYY-MM-DD",
    "status": "approved|approved_as_noted|revise_resubmit|rejected",
    "reviewer_comments": "string or null",
    "materials": [
        {
            "name": "string",
            "specification": "string",
            "quantity": "string"
        }
    ]
}

Contract Schema

contract_schema = {
    "contract_number": "string",
    "project_name": "string",
    "owner": {
        "name": "string",
        "address": "string"
    },
    "contractor": {
        "name": "string",
        "address": "string"
    },
    "contract_amount": "number",
    "start_date": "YYYY-MM-DD",
    "completion_date": "YYYY-MM-DD",
    "liquidated_damages": "number per day",
    "retention_percentage": "number",
    "key_clauses": [
        {
            "clause_number": "string",
            "title": "string",
            "summary": "string"
        }
    ]
}

Batch Processing with n8n

{
  "workflow": "Document Extraction Pipeline",
  "trigger": "Watch folder for new PDFs",
  "nodes": [
    {
      "name": "Read PDF",
      "type": "Read Binary Files"
    },
    {
      "name": "Classify Document",
      "type": "AI Agent",
      "prompt": "Classify this document: RFI, Submittal, Contract, Spec, or Other"
    },
    {
      "name": "Route by Type",
      "type": "Switch",
      "rules": ["RFI", "Submittal", "Contract", "Spec"]
    },
    {
      "name": "Extract RFI",
      "type": "OpenAI",
      "schema": "rfi_schema"
    },
    {
      "name": "Extract Submittal",
      "type": "OpenAI",
      "schema": "submittal_schema"
    },
    {
      "name": "Save to Database",
      "type": "PostgreSQL",
      "operation": "insert"
    },
    {
      "name": "Update Dashboard",
      "type": "HTTP Request",
      "method": "POST"
    }
  ]
}

Vision Model for Drawings

import base64

def extract_from_drawing(image_path: str, query: str) -> str:
    """Extract information from drawings using vision model"""

    with open(image_path, "rb") as f:
        image_data = base64.standard_b64encode(f.read()).decode()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": query},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_data}"
                        }
                    }
                ]
            }
        ]
    )

    return response.choices[0].message.content

# Example: Extract room areas from floor plan
areas = extract_from_drawing(
    "floor_plan.png",
    "List all rooms with their areas in square meters. Return as JSON."
)

RAG for Large Documents

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant

def create_document_index(pdf_path: str):
    """Create searchable index for large documents"""

    # Load and split
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
    chunks = splitter.split_documents(docs)

    # Create vector store
    embeddings = OpenAIEmbeddings()
    vectorstore = Qdrant.from_documents(
        chunks,
        embeddings,
        collection_name="contract_docs"
    )

    return vectorstore

def query_document(vectorstore, question: str) -> str:
    """Query document with RAG"""

    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    docs = retriever.invoke(question)

    context = "\n".join(doc.page_content for doc in docs)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Answer based on the contract excerpts provided."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}
        ]
    )

    return response.choices[0].message.content

# Usage
index = create_document_index("contract_100pages.pdf")
answer = query_document(index, "What are the liquidated damages terms?")

Requirements

pip install openai pdfplumber langchain langchain-openai qdrant-client

Resources

Weekly Installs
4
GitHub Stars
51
First Seen
9 days ago
Installed on
opencode4
gemini-cli4
antigravity4
github-copilot4
codex4
kimi-cli4