skills/transilienceai/communitytools/html-content-analysis

html-content-analysis

SKILL.md

HTML Content Analysis Skill

Purpose

Parse HTML documents to extract technology signals from meta generator tags, HTML comments, script URLs, CSS framework classes, and JSON-LD structured data.

Operations

1. extract_meta_generator

Find <meta name="generator"> tags and match their content attribute against the generator patterns below.

Command (conceptual):

curl -s {url} | grep -oP "<meta[^>]*name=[\"']generator[\"'][^>]*content=[\"'][^\"']+[\"']"

Generator Patterns:

{
  "WordPress": {
    "pattern": "WordPress[\\s]?([\\d.]+)?",
    "tech": "WordPress",
    "extract_version": true,
    "confidence": 95
  },
  "Drupal": {
    "pattern": "Drupal[\\s]?([\\d.]+)?",
    "tech": "Drupal",
    "extract_version": true,
    "confidence": 95
  },
  "Joomla": {
    "pattern": "Joomla!?[\\s]?([\\d.]+)?",
    "tech": "Joomla",
    "extract_version": true,
    "confidence": 95
  },
  "Shopify": {
    "pattern": "Shopify",
    "tech": "Shopify",
    "confidence": 95
  },
  "Wix.com": {
    "pattern": "Wix\\.com",
    "tech": "Wix",
    "confidence": 95
  },
  "Squarespace": {
    "pattern": "Squarespace",
    "tech": "Squarespace",
    "confidence": 95
  },
  "Ghost": {
    "pattern": "Ghost[\\s]?([\\d.]+)?",
    "tech": "Ghost",
    "extract_version": true,
    "confidence": 95
  },
  "Hugo": {
    "pattern": "Hugo[\\s]?([\\d.]+)?",
    "tech": "Hugo",
    "extract_version": true,
    "confidence": 95
  },
  "Jekyll": {
    "pattern": "Jekyll[\\s]?([\\d.]+)?",
    "tech": "Jekyll",
    "extract_version": true,
    "confidence": 95
  },
  "Gatsby": {
    "pattern": "Gatsby[\\s]?([\\d.]+)?",
    "tech": "Gatsby",
    "extract_version": true,
    "implies": ["React"],
    "confidence": 95
  }
}

2. scan_html_comments

Look for technology hints in HTML comments.

Comment Patterns:

{
  "Powered by": {
    "pattern": "<!--[^>]*[Pp]owered by ([^>-]+)-->",
    "extract_group": 1,
    "confidence": 80
  },
  "Generated by": {
    "pattern": "<!--[^>]*[Gg]enerated by ([^>-]+)-->",
    "extract_group": 1,
    "confidence": 80
  },
  "Built with": {
    "pattern": "<!--[^>]*[Bb]uilt with ([^>-]+)-->",
    "extract_group": 1,
    "confidence": 75
  },
  "WordPress": {
    "pattern": "<!--[^>]*wp-content[^>]*-->",
    "tech": "WordPress",
    "confidence": 85
  },
  "Drupal": {
    "pattern": "<!--[^>]*drupal[^>]*-->",
    "tech": "Drupal",
    "confidence": 85
  },
  "Magento": {
    "pattern": "<!--[^>]*Magento[^>]*-->",
    "tech": "Magento",
    "confidence": 90
  }
}

3. analyze_script_urls

Identify framework-specific paths in script tags.

Script URL Patterns:

{
  "/wp-content/": {"tech": "WordPress", "confidence": 95},
  "/wp-includes/": {"tech": "WordPress", "confidence": 95},
  "/sites/default/files/": {"tech": "Drupal", "confidence": 90},
  "/misc/drupal.js": {"tech": "Drupal", "confidence": 95},
  "/_next/": {"tech": "Next.js", "confidence": 95, "implies": ["React"]},
  "/_nuxt/": {"tech": "Nuxt.js", "confidence": 95, "implies": ["Vue.js"]},
  "/static/js/main.": {"tech": "Create React App", "confidence": 85},
  "/assets/application-": {"tech": "Ruby on Rails", "confidence": 80},
  "/bundles/": {"tech": "ASP.NET", "confidence": 75},
  "/Scripts/": {"tech": "ASP.NET", "confidence": 75},
  "jquery": {"tech": "jQuery", "confidence": 90},
  "bootstrap": {"tech": "Bootstrap", "confidence": 90},
  "angular": {"tech": "Angular", "confidence": 85},
  "react": {"tech": "React", "confidence": 85},
  "vue": {"tech": "Vue.js", "confidence": 85}
}

4. detect_css_frameworks

Find CSS framework classes in HTML.

CSS Framework Patterns:

{
  "Bootstrap": {
    "classes": ["btn btn-", "container", "navbar", "row", "col-", "card", "modal"],
    "min_matches": 3,
    "confidence": 85
  },
  "Tailwind CSS": {
    "classes": ["bg-", "text-", "flex", "p-", "m-", "w-", "h-", "rounded-"],
    "min_matches": 5,
    "confidence": 85
  },
  "Foundation": {
    "classes": ["button", "callout", "top-bar", "grid-x", "cell"],
    "min_matches": 3,
    "confidence": 85
  },
  "Bulma": {
    "classes": ["button is-", "columns", "column", "hero", "box"],
    "min_matches": 3,
    "confidence": 85
  },
  "Material UI": {
    "classes": ["MuiButton", "MuiGrid", "MuiPaper", "MuiTypography"],
    "min_matches": 2,
    "confidence": 90
  },
  "Chakra UI": {
    "classes": ["chakra-", "css-"],
    "min_matches": 2,
    "confidence": 80
  },
  "Ant Design": {
    "classes": ["ant-btn", "ant-card", "ant-table", "ant-form"],
    "min_matches": 2,
    "confidence": 90
  }
}

5. extract_structured_data

Find JSON-LD and structured data.

Process:

  1. Find <script type="application/ld+json"> tags
  2. Parse JSON content
  3. Extract @type and properties
  4. Note e-commerce, organization, or product data

Structured Data Signals:

{
  "Product": {"indicates": "E-commerce site"},
  "Organization": {"indicates": "Business website"},
  "LocalBusiness": {"indicates": "Local business"},
  "Article": {"indicates": "Blog/News site"},
  "WebApplication": {"indicates": "Web app"}
}

Output

{
  "skill": "html_content_analysis",
  "domain": "string",
  "results": {
    "pages_analyzed": "number",
    "meta_generators": [
      {
        "url": "string",
        "generator": "WordPress 6.4.2",
        "tech": "WordPress",
        "version": "6.4.2",
        "confidence": 95
      }
    ],
    "html_comments": [
      {
        "url": "string",
        "comment": "Powered by Django",
        "tech": "Django",
        "confidence": 80
      }
    ],
    "script_analysis": {
      "urls_analyzed": "number",
      "frameworks_detected": [
        {
          "tech": "Next.js",
          "source": "/_next/ path pattern",
          "confidence": 95
        }
      ],
      "cdns_used": ["cdnjs.cloudflare.com", "unpkg.com"]
    },
    "css_frameworks": [
      {
        "name": "Bootstrap",
        "classes_matched": ["btn", "container", "navbar", "row"],
        "confidence": 85
      },
      {
        "name": "Tailwind CSS",
        "classes_matched": ["bg-white", "text-gray-500", "flex"],
        "confidence": 85
      }
    ],
    "structured_data": [
      {
        "type": "Organization",
        "url": "string",
        "signals": "Business website"
      }
    ],
    "technologies_summary": [
      {
        "name": "string",
        "category": "CMS|Framework|CSS|Library",
        "confidence": "number",
        "sources": ["array"]
      }
    ]
  },
  "evidence": [
    {
      "type": "meta_generator",
      "url": "string",
      "value": "string",
      "timestamp": "ISO-8601"
    },
    {
      "type": "script_url",
      "url": "string",
      "path": "string"
    }
  ]
}

Rate Limiting

  • Page fetches: 10/minute per domain
  • HTML parsing: No limit (local processing)

Error Handling

  • Malformed HTML: Use lenient parser
  • Large pages: Limit to first 5MB
  • Encoding issues: Detect and handle charset
  • Continue on parse errors

Security Considerations

  • Only fetch public pages
  • Do not execute scripts
  • Sanitize extracted content
  • Log all fetches for audit
Weekly Installs
4
GitHub Stars
67
First Seen
5 days ago
Installed on
opencode: 4
claude-code: 4
github-copilot: 4
codex: 4
amp: 4
cline: 4