Document Inventory Skill

Overview

This skill scans document collections (PDFs, Word docs, text files) and creates a structured inventory with metadata, automatic categorization, and collection statistics. Essential first step before building knowledge bases.

Quick Start

from pathlib import Path
import sqlite3

# Scan directory
documents = []
for filepath in Path("/path/to/docs").rglob("*.pdf"):
    documents.append({
        'filename': filepath.name,
        'size': filepath.stat().st_size,
        'path': str(filepath)
    })

# Store in database
conn = sqlite3.connect("inventory.db")
cursor = conn.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS docs (name TEXT, size INTEGER, path TEXT)")
for doc in documents:
    cursor.execute("INSERT INTO docs VALUES (?, ?, ?)",
                   (doc['filename'], doc['size'], doc['path']))
conn.commit()
print(f"Inventoried {len(documents)} documents")

When to Use

Auditing large document libraries before processing
Understanding the scope of a document collection
Categorizing documents by type, source, or content
Preparing inventories for knowledge base creation
Generating reports on document collections
Identifying duplicates or organizing files

Features

Recursive scanning - Process nested directories
Metadata extraction - Size, dates, page counts
Auto-categorization - Pattern-based classification
Statistics generation - Collection summaries
SQLite storage - Queryable inventory database
Multiple formats - PDF, DOCX, TXT, and more

Implementation

Core Inventory Builder

#!/usr/bin/env python3
"""Document inventory builder."""

import sqlite3
import os
from pathlib import Path
from datetime import datetime
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DocumentInventory:
    """Build and manage document inventories."""

    SUPPORTED_EXTENSIONS = {
        '.pdf': 'PDF',
        '.docx': 'Word',
        '.doc': 'Word',
        '.txt': 'Text',
        '.md': 'Markdown',
        '.xlsx': 'Excel',
        '.xls': 'Excel',
        '.pptx': 'PowerPoint',
        '.ppt': 'PowerPoint',
    }

    def __init__(self, db_path):
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path, timeout=30)
        self._setup_tables()

    def _setup_tables(self):
        cursor = self.conn.cursor()

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY,
                filename TEXT NOT NULL,
                filepath TEXT UNIQUE NOT NULL,
                extension TEXT,
                file_type TEXT,
                category TEXT,
                file_size INTEGER,
                created_date TEXT,
                modified_date TEXT,
                parent_dir TEXT,
                depth INTEGER,
                scanned_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_category ON documents(category)
        ''')

        cursor.execute('''
            CREATE INDEX IF NOT EXISTS idx_extension ON documents(extension)
        ''')

        self.conn.commit()

    def scan_directory(self, root_path):
        """Scan directory and build inventory."""
        root = Path(root_path).resolve()
        logger.info(f"Scanning: {root}")

        count = 0
        for filepath in root.rglob('*'):
            if filepath.is_file():
                ext = filepath.suffix.lower()
                if ext in self.SUPPORTED_EXTENSIONS:
                    self._add_document(filepath, root)
                    count += 1

                    if count % 500 == 0:
                        logger.info(f"Scanned {count} documents...")
                        self.conn.commit()

        self.conn.commit()
        logger.info(f"Scan complete: {count} documents found")
        return count

    def _add_document(self, filepath, root):
        """Add document to inventory."""
        cursor = self.conn.cursor()

        try:
            stat = filepath.stat()
            ext = filepath.suffix.lower()

            cursor.execute('''
                INSERT OR REPLACE INTO documents
                (filename, filepath, extension, file_type, category,
                 file_size, created_date, modified_date, parent_dir, depth)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                filepath.name,
                str(filepath),
                ext,
                self.SUPPORTED_EXTENSIONS.get(ext, 'Unknown'),
                self._categorize(filepath),
                stat.st_size,
                datetime.fromtimestamp(stat.st_ctime).isoformat(),
                datetime.fromtimestamp(stat.st_mtime).isoformat(),
                str(filepath.parent),
                len(filepath.relative_to(root).parts) - 1
            ))

        except Exception as e:
            logger.warning(f"Error adding {filepath}: {e}")

    def _categorize(self, filepath):
        """Auto-categorize document based on patterns."""
        name = filepath.name.upper()
        path_str = str(filepath).upper()

        # Industry standard patterns
        patterns = {
            'API': 'API',
            'ISO': 'ISO',
            'ASME': 'ASME',
            'DNV': 'DNV',
            'NORSOK': 'NORSOK',
            'BSI': 'BSI',
            'ASTM': 'ASTM',
            'AWS': 'AWS',
            'ABS': 'ABS',
            'AISC': 'AISC',
            'IEEE': 'IEEE',
        }

        for pattern, category in patterns.items():
            if pattern in name or pattern in path_str:
                return category

        # Path-based categorization
        path_categories = {
            'STANDARD': 'Standards',
            'SPEC': 'Specifications',
            'MANUAL': 'Manuals',
            'GUIDE': 'Guides',
            'REPORT': 'Reports',
            'DRAWING': 'Drawings',
            'PROCEDURE': 'Procedures',
        }

        for pattern, category in path_categories.items():
            if pattern in path_str:
                return category

        return 'Unknown'

    def get_statistics(self):
        """Get inventory statistics."""
        cursor = self.conn.cursor()

        stats = {}

        # Total count
        cursor.execute('SELECT COUNT(*) FROM documents')
        stats['total_documents'] = cursor.fetchone()[0]

        # Total size
        cursor.execute('SELECT SUM(file_size) FROM documents')
        total_bytes = cursor.fetchone()[0] or 0
        stats['total_size_mb'] = round(total_bytes / (1024 * 1024), 2)

        # By file type
        cursor.execute('''
            SELECT file_type, COUNT(*), SUM(file_size)
            FROM documents
            GROUP BY file_type
            ORDER BY COUNT(*) DESC
        ''')
        stats['by_type'] = {
            row[0]: {'count': row[1], 'size_mb': round((row[2] or 0) / 1024 / 1024, 2)}
            for row in cursor.fetchall()
        }

        # By category
        cursor.execute('''
            SELECT category, COUNT(*)
            FROM documents
            GROUP BY category
            ORDER BY COUNT(*) DESC
        ''')
        stats['by_category'] = dict(cursor.fetchall())

        # By extension
        cursor.execute('''
            SELECT extension, COUNT(*)
            FROM documents
            GROUP BY extension
            ORDER BY COUNT(*) DESC
        ''')
        stats['by_extension'] = dict(cursor.fetchall())

        return stats

    def search(self, query, category=None, file_type=None, limit=50):
        """Search inventory."""
        cursor = self.conn.cursor()

        sql = 'SELECT filename, filepath, category, file_size FROM documents WHERE 1=1'
        params = []

        if query:
            sql += ' AND filename LIKE ?'
            params.append(f'%{query}%')

        if category:
            sql += ' AND category = ?'
            params.append(category)

        if file_type:
            sql += ' AND file_type = ?'
            params.append(file_type)

        sql += ' ORDER BY filename LIMIT ?'
        params.append(limit)

        cursor.execute(sql, params)
        return cursor.fetchall()

    def export_csv(self, output_path):
        """Export inventory to CSV."""
        import csv

        cursor = self.conn.cursor()
        cursor.execute('SELECT * FROM documents')

        columns = [desc[0] for desc in cursor.description]

        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(columns)
            writer.writerows(cursor.fetchall())

        logger.info(f"Exported to {output_path}")

CLI Interface

#!/usr/bin/env python3
"""Document Inventory CLI."""

import argparse
import json

def main():
    parser = argparse.ArgumentParser(description='Document Inventory Tool')
    subparsers = parser.add_subparsers(dest='command', help='Commands')

    # Scan command
    scan_parser = subparsers.add_parser('scan', help='Scan directory')
    scan_parser.add_argument('path', help='Directory to scan')
    scan_parser.add_argument('--db', default='inventory.db', help='Database path')

    # Stats command
    stats_parser = subparsers.add_parser('stats', help='Show statistics')
    stats_parser.add_argument('--db', default='inventory.db', help='Database path')
    stats_parser.add_argument('--json', action='store_true', help='Output as JSON')

    # Search command
    search_parser = subparsers.add_parser('search', help='Search inventory')
    search_parser.add_argument('query', help='Search query')
    search_parser.add_argument('--db', default='inventory.db', help='Database path')
    search_parser.add_argument('--category', help='Filter by category')
    search_parser.add_argument('--type', help='Filter by file type')

    # Export command
    export_parser = subparsers.add_parser('export', help='Export to CSV')
    export_parser.add_argument('output', help='Output CSV path')
    export_parser.add_argument('--db', default='inventory.db', help='Database path')

    args = parser.parse_args()

    if args.command == 'scan':
        inventory = DocumentInventory(args.db)
        count = inventory.scan_directory(args.path)
        print(f"\nScanned {count} documents")

        stats = inventory.get_statistics()
        print(f"Total size: {stats['total_size_mb']} MB")
        print(f"\nBy category:")
        for cat, count in list(stats['by_category'].items())[:10]:
            print(f"  {cat}: {count}")

    elif args.command == 'stats':
        inventory = DocumentInventory(args.db)
        stats = inventory.get_statistics()

        if args.json:
            print(json.dumps(stats, indent=2))
        else:
            print(f"Total Documents: {stats['total_documents']}")
            print(f"Total Size: {stats['total_size_mb']} MB")
            print(f"\nBy Type:")
            for t, data in stats['by_type'].items():
                print(f"  {t}: {data['count']} ({data['size_mb']} MB)")
            print(f"\nBy Category:")
            for cat, count in list(stats['by_category'].items())[:15]:
                print(f"  {cat}: {count}")

    elif args.command == 'search':
        inventory = DocumentInventory(args.db)
        results = inventory.search(
            args.query,
            category=args.category,
            file_type=args.type
        )

        print(f"Found {len(results)} results:\n")
        for filename, filepath, category, size in results:
            size_kb = size / 1024
            print(f"  [{category:10}] {filename} ({size_kb:.1f} KB)")

    elif args.command == 'export':
        inventory = DocumentInventory(args.db)
        inventory.export_csv(args.output)

    else:
        parser.print_help()


if __name__ == '__main__':
    main()

Report Generator

def generate_report(db_path, output_path):
    """Generate HTML inventory report."""
    inventory = DocumentInventory(db_path)
    stats = inventory.get_statistics()

    html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Document Inventory Report</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 40px; }}
            h1 {{ color: #333; }}
            .stat-box {{ background: #f5f5f5; padding: 20px; margin: 10px 0; border-radius: 8px; }}
            .stat-value {{ font-size: 2em; font-weight: bold; color: #2196F3; }}
            table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
            th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
            th {{ background: #2196F3; color: white; }}
            tr:nth-child(even) {{ background: #f9f9f9; }}
        </style>
    </head>
    <body>
        <h1>Document Inventory Report</h1>
        <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>

        <div class="stat-box">
            <div class="stat-value">{stats['total_documents']:,}</div>
            <div>Total Documents</div>
        </div>

        <div class="stat-box">
            <div class="stat-value">{stats['total_size_mb']:,.1f} MB</div>
            <div>Total Size</div>
        </div>

        <h2>By File Type</h2>
        <table>
            <tr><th>Type</th><th>Count</th><th>Size (MB)</th></tr>
            {''.join(f"<tr><td>{t}</td><td>{d['count']}</td><td>{d['size_mb']}</td></tr>"
                     for t, d in stats['by_type'].items())}
        </table>

        <h2>By Category</h2>
        <table>
            <tr><th>Category</th><th>Count</th></tr>
            {''.join(f"<tr><td>{c}</td><td>{n}</td></tr>"
                     for c, n in stats['by_category'].items())}
        </table>
    </body>
    </html>
    """

    with open(output_path, 'w') as f:
        f.write(html)

    print(f"Report generated: {output_path}")

Custom Categorization

Extend with Your Patterns

# Add custom patterns for your domain
CUSTOM_PATTERNS = {
    'SPEC': 'Specifications',
    'DWG': 'Drawings',
    'REV': 'Revisions',
    'APPROVED': 'Approved',
    'DRAFT': 'Draft',
    'SUPERSEDED': 'Superseded',
}

def categorize_custom(filepath):
    name = filepath.name.upper()
    for pattern, category in CUSTOM_PATTERNS.items():
        if pattern in name:
            return category
    return 'Uncategorized'

Multi-Level Categories

def categorize_hierarchical(filepath):
    """Create hierarchical categories."""
    name = filepath.name.upper()

    # Primary category
    primary = 'General'
    if 'API' in name:
        primary = 'API Standards'
    elif 'ISO' in name:
        primary = 'ISO Standards'

    # Secondary category
    secondary = 'Other'
    if 'DESIGN' in name:
        secondary = 'Design'
    elif 'SAFETY' in name:
        secondary = 'Safety'
    elif 'QUALITY' in name:
        secondary = 'Quality'

    return f"{primary}/{secondary}"

Execution Checklist

Identify target directory for scanning
Create SQLite database for inventory
Run initial scan and review results
Customize categorization patterns if needed
Generate statistics report
Export to CSV for review
Generate HTML report for stakeholders
Plan next steps (knowledge base creation)

Error Handling

Common Errors

Error: PermissionError

Cause: Insufficient permissions to read files
Solution: Run with appropriate permissions or skip protected files

Error: sqlite3.OperationalError (database is locked)

Cause: Concurrent access without timeout
Solution: Use timeout=30 when connecting

Error: UnicodeDecodeError in filenames

Cause: Non-UTF8 characters in file paths
Solution: Use errors='replace' when processing paths

Error: OSError (too many open files)

Cause: Not closing file handles properly
Solution: Use context managers and batch commits

Error: Slow scanning on network drives

Cause: Network latency for each file access
Solution: Copy to local drive or use async scanning

Metrics

Metric	Typical Value
Scan speed (local)	~1000 files/second
Scan speed (network)	~100 files/second
Database size	~1KB per 10 documents
Memory usage	~50MB for 100K documents
Report generation	<1 second

Best Practices

Scan before processing - Always inventory first
Use SQLite timeout - timeout=30 for concurrent access
Batch commits - Commit every 500 files
Handle errors gracefully - Log and continue on failures
Export for review - Generate CSV/HTML for stakeholders
Update incrementally - Use INSERT OR REPLACE

Example Usage

# Scan directory
python inventory.py scan /path/to/documents --db inventory.db

# View statistics
python inventory.py stats --db inventory.db

# Search
python inventory.py search "API" --category "Standards"

# Export to CSV
python inventory.py export inventory.csv --db inventory.db

Related Skills

knowledge-base-builder - Build searchable database after inventory
pdf-text-extractor - Extract text from inventoried PDFs
semantic-search-setup - Add AI search capabilities

Dependencies

# No external dependencies - uses Python standard library
# Optional: pandas for advanced data manipulation
pip install pandas

Version History

1.1.0 (2026-01-02): Added Quick Start, Execution Checklist, Error Handling, Metrics sections; updated frontmatter with version, category, related_skills
1.0.0 (2024-10-15): Initial release with SQLite storage, auto-categorization, CLI interface

document-inventory