🦞
specification-extractor

Extract structured data
SKILL.md
---
slug: "specification-extractor"
display_name: "Specification Extractor"
description: "Extract structured data from construction specifications. Parse CSI sections, requirements, submittals, and product data from spec documents."
---

# Specification Extractor for Construction

## Overview

Extract structured data from construction specification documents. Parse CSI MasterFormat sections, identify requirements, submittals, product standards, and compile actionable data for estimating and procurement.

## Business Case

Automated spec extraction enables:
- **Faster Estimating**: Quickly identify scope and requirements
- **Procurement Accuracy**: Extract exact product specifications
- **Submittal Tracking**: Identify all required submittals
- **Compliance Checking**: Verify specs against standards

## Technical Implementation

```python
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
import re
import pdfplumber
from pathlib import Path

@dataclass
class SpecSection:
    number: str  # e.g., "03 30 00"
    title: str
    part1_general: Dict[str, Any]
    part2_products: Dict[str, Any]
    part3_execution: Dict[str, Any]
    raw_text: str

@dataclass
class ProductRequirement:
    section: str
    manufacturer: str
    product_name: str
    model: str
    standards: List[str]
    properties: Dict[str, str]

@dataclass
class SubmittalRequirement:
    section: str
    submittal_type: str  # shop drawings, samples, product data, etc.
    description: str
    timing: str
    copies: int

@dataclass
class SpecExtractionResult:
    document_name: str
    total_pages: int
    sections: List[SpecSection]
    products: List[ProductRequirement]
    submittals: List[SubmittalRequirement]
    standards_referenced: List[str]

class SpecificationExtractor:
    """Extract structured data from construction specifications."""

    # CSI MasterFormat patterns
    CSI_SECTION_PATTERN = r'^(\d{2}\s?\d{2}\s?\d{2})\s*[-–]\s*(.+?)#x27;
    PART_PATTERN = r'^PART\s+(\d+)\s*[-–]\s*(.+?)#x27;
    ARTICLE_PATTERN = r'^(\d+\.\d+)\s+([A-Z][A-Z\s]+)#x27;

    # Submittal type keywords
    SUBMITTAL_TYPES = {
        'shop drawings': 'Shop Drawings',
        'product data': 'Product Data',
        'samples': 'Samples',
        'certificates': 'Certificates',
        'test reports': 'Test Reports',
        'manufacturer instructions': 'Manufacturer Instructions',
        'warranty': 'Warranty',
        'maintenance data': 'Maintenance Data',
        'mock-ups': 'Mock-ups',
    }

    # Common standard organizations
    STANDARD_PATTERNS = [
        r'ASTM\s+[A-Z]\d+',
        r'ANSI\s+[A-Z]?\d+',
        r'ACI\s+\d+',
        r'AISC\s+\d+',
        r'AWS\s+[A-Z]\d+',
        r'ASCE\s+\d+',
        r'UL\s+\d+',
        r'FM\s+\d+',
        r'NFPA\s+\d+',
        r'IBC\s+\d+',
    ]

    def __init__(self):
        self.sections: Dict[str, SpecSection] = {}

    def extract_from_pdf(self, pdf_path: str) -> SpecExtractionResult:
        """Extract specification data from PDF."""
        path = Path(pdf_path)

        all_text = ""
        page_count = 0

        with pdfplumber.open(pdf_path) as pdf:
            page_count = len(pdf.pages)
            for page in pdf.pages:
                text = page.extract_text() or ""
                all_text += text + "\n\n"

        # Parse sections
        sections = self._parse_sections(all_text)

        # Extract products
        products = self._extract_products(sections)

        # Extract submittals
        submittals = self._extract_submittals(sections)

        # Extract standards
        standards = self._extract_standards(all_text)

        return SpecExtractionResult(
            document_name=path.name,
            total_pages=page_count,
            sections=sections,
            products=products,
            submittals=submittals,
            standards_referenced=standards
        )

    def _parse_sections(self, text: str) -> List[SpecSection]:
        """Parse CSI sections from specification text."""
        sections = []
        lines = text.split('\n')

        current_section = None
        current_part = None
        current_content = []

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Check for section header
            section_match = re.match(self.CSI_SECTION_PATTERN, line, re.IGNORECASE)
            if section_match:
                # Save previous section
                if current_section:
                    sections.append(self._finalize_section(current_section, current_content))

                current_section = {
                    'number': section_match.group(1).replace(' ', ''),
                    'title': section_match.group(2).strip(),
                    'parts': {}
                }
                current_content = []
                current_part = None
                continue

            # Check for part header
            part_match = re.match(self.PART_PATTERN, line, re.IGNORECASE)
            if part_match and current_section:
                part_num = part_match.group(1)
                part_name = part_match.group(2).strip()
                current_part = f"part{part_num}"
                current_section['parts'][current_part] = {
                    'name': part_name,
                    'content': []
                }
                continue

            # Add content to current part
            if current_section and current_part:
                current_section['parts'][current_part]['content'].append(line)
            elif current_section:
                current_content.append(line)

        # Save last section
        if current_section:
            sections.append(self._finalize_section(current_section, current_content))

        return sections

    def _finalize_section(self, section_data: Dict, general_content: List[str]) -> SpecSection:
        """Finalize a section with parsed parts."""
        parts = section_data.get('parts', {})

        part1 = self._parse_part_content(parts.get('part1', {}).get('content', []))
        part2 = self._parse_part_content(parts.get('part2', {}).get('content', []))
        part3 = self._parse_part_content(parts.get('part3', {}).get('content', []))

        return SpecSection(
            number=section_data['number'],
            title=section_data['title'],
            part1_general=part1,
            part2_products=part2,
            part3_execution=part3,
            raw_text='\n'.join(general_content)
        )

    def _parse_part_content(self, content: List[str]) -> Dict[str, Any]:
        """Parse part content into structured data."""
        result = {
            'articles': {},
            'items': []
        }

        current_article = None

        for line in content:
            # Check for article header
            article_match = re.match(self.ARTICLE_PATTERN, line)
            if article_match:
                current_article = article_match.group(1)
                result['articles'][current_article] = {
                    'title': article_match.group(2),
                    'items': []
                }
                continue

            # Add to current article or general items
            if current_article and current_article in result['articles']:
                result['articles'][current_article]['items'].append(line)
            else:
                result['items'].append(line)

        return result

    def _extract_products(self, sections: List[SpecSection]) -> List[ProductRequirement]:
        """Extract product requirements from Part 2."""
        products = []

        for section in sections:
            part2 = section.part2_products

            for article_num, article in part2.get('articles', {}).items():
                if 'MANUFACTURERS' in article['title'].upper():
                    for item in article['items']:
                        # Extract manufacturer names
                        if item.strip().startswith(('A.', 'B.', 'C.', '1.', '2.', '3.')):
                            mfr_name = re.sub(r'^[A-Z\d]+\.\s*', '', item).strip()
                            products.append(ProductRequirement(
                                section=section.number,
                                manufacturer=mfr_name,
                                product_name='',
                                model='',
                                standards=[],
                                properties={}
                            ))

                elif 'MATERIALS' in article['title'].upper() or 'PRODUCTS' in article['title'].upper():
                    for item in article['items']:
                        # Extract material requirements
                        standards = self._extract_standards(item)
                        if standards:
                            products.append(ProductRequirement(
                                section=section.number,
                                manufacturer='',
                                product_name=item[:100],
                                model='',
                                standards=standards,
                                properties={}
                            ))

        return products

    def _extract_submittals(self, sections: List[SpecSection]) -> List[SubmittalRequirement]:
        """Extract submittal requirements from Part 1."""
        submittals = []

        for section in sections:
            part1 = section.part1_general

            for article_num, article in part1.get('articles', {}).items():
                if 'SUBMITTAL' in article['title'].upper():
                    for item in article['items']:
          

... (truncated)