Back to Skills
    🦞

    specification-extractor

    Extract structured data

    By @datadrivenconstruction
    View on GitHub
    SKILL.md
    ---
    slug: "specification-extractor"
    display_name: "Specification Extractor"
    description: "Extract structured data from construction specifications. Parse CSI sections, requirements, submittals, and product data from spec documents."
    ---
    
    # Specification Extractor for Construction
    
    ## Overview
    
    Extract structured data from construction specification documents: parse CSI MasterFormat sections; identify requirements, submittals, and product standards; and compile actionable data for estimating and procurement.
    
    ## Business Case
    
    Automated spec extraction enables:
    - **Faster Estimating**: Quickly identify scope and requirements
    - **Procurement Accuracy**: Extract exact product specifications
    - **Submittal Tracking**: Identify all required submittals
    - **Compliance Checking**: Verify specs against standards
    
    ## Technical Implementation
    
    ```python
    from dataclasses import dataclass, field
    from typing import List, Dict, Any, Optional
    import re
    import pdfplumber
    from pathlib import Path
    
    @dataclass
    class SpecSection:
        """One CSI MasterFormat section parsed from a specification document.

        Only ``number`` and ``title`` are required. The three part dictionaries
        default to fresh empty dicts and ``raw_text`` to an empty string, so a
        section that lacks one or more PARTs can be built without placeholder
        arguments. Positional construction with all six values still works.
        """

        # CSI section number. The parser in this file removes the spaces
        # before storing it, so a "03 30 00" header is kept as "033000".
        number: str
        # Section title taken from the header line, surrounding whitespace
        # stripped.
        title: str
        # Parsed content of PART 1, PART 2 and PART 3. When produced by the
        # parser in this file each dict holds an 'articles' mapping and an
        # 'items' list; an empty dict means the part was not present.
        part1_general: Dict[str, Any] = field(default_factory=dict)
        part2_products: Dict[str, Any] = field(default_factory=dict)
        part3_execution: Dict[str, Any] = field(default_factory=dict)
        # Lines found after the section header but before the first PART
        # header, joined with newlines (not the full text of the section).
        raw_text: str = ""
    
    @dataclass
    class ProductRequirement:
        """A product or manufacturer requirement found in a spec section.

        Only ``section`` is required. Every other field defaults to an empty
        value ("" for strings, a fresh list/dict for the containers) because
        the extractor in this file fills in either the manufacturer or the
        product description and leaves the rest empty. Field order is
        unchanged, so existing positional and keyword construction still works.
        """

        # Number of the section the requirement was found in.
        section: str
        # Manufacturer name; "" when the entry describes a material instead.
        manufacturer: str = ""
        # Product or material description; "" for manufacturer-only entries.
        product_name: str = ""
        # Model designation; "" when none was identified.
        model: str = ""
        # Referenced standards, e.g. "ASTM C150". default_factory gives each
        # instance its own list instead of a shared mutable default.
        standards: List[str] = field(default_factory=list)
        # Additional name -> value properties of the product.
        properties: Dict[str, str] = field(default_factory=dict)
    
    @dataclass
    class SubmittalRequirement:
        """A single submittal called for by a specification section.

        All five fields are required and are accepted positionally in the
        order declared below.
        """

        # Number of the section that requires the submittal.
        section: str
        # Kind of submittal: shop drawings, samples, product data, etc.
        submittal_type: str
        # Text describing what has to be submitted.
        description: str
        # When the submittal is due, as free text.
        timing: str
        # Number of copies to provide.
        copies: int
    
    @dataclass
    class SpecExtractionResult:
        """Everything extracted from one specification document.

        All six fields are required and are accepted positionally in the
        order declared below.
        """

        # File name of the source document (name only, not the full path,
        # when filled in by the extractor in this file).
        document_name: str
        # Number of pages in the source document.
        total_pages: int
        # Parsed sections.
        sections: List[SpecSection]
        # Product and manufacturer requirements gathered from the sections.
        products: List[ProductRequirement]
        # Submittal requirements gathered from the sections.
        submittals: List[SubmittalRequirement]
        # Standards referenced in the document text, e.g. "ASTM C150".
        standards_referenced: List[str]
    
    class SpecificationExtractor:
        """Extract structured data from construction specifications."""
    
        # CSI MasterFormat patterns
        CSI_SECTION_PATTERN = r'^(\d{2}\s?\d{2}\s?\d{2})\s*[-–]\s*(.+?)
    #x27; PART_PATTERN = r'^PART\s+(\d+)\s*[-–]\s*(.+?)
    #x27; ARTICLE_PATTERN = r'^(\d+\.\d+)\s+([A-Z][A-Z\s]+)
    #x27; # Submittal type keywords SUBMITTAL_TYPES = { 'shop drawings': 'Shop Drawings', 'product data': 'Product Data', 'samples': 'Samples', 'certificates': 'Certificates', 'test reports': 'Test Reports', 'manufacturer instructions': 'Manufacturer Instructions', 'warranty': 'Warranty', 'maintenance data': 'Maintenance Data', 'mock-ups': 'Mock-ups', } # Common standard organizations STANDARD_PATTERNS = [ r'ASTM\s+[A-Z]\d+', r'ANSI\s+[A-Z]?\d+', r'ACI\s+\d+', r'AISC\s+\d+', r'AWS\s+[A-Z]\d+', r'ASCE\s+\d+', r'UL\s+\d+', r'FM\s+\d+', r'NFPA\s+\d+', r'IBC\s+\d+', ] def __init__(self): self.sections: Dict[str, SpecSection] = {} def extract_from_pdf(self, pdf_path: str) -> SpecExtractionResult: """Extract specification data from PDF.""" path = Path(pdf_path) all_text = "" page_count = 0 with pdfplumber.open(pdf_path) as pdf: page_count = len(pdf.pages) for page in pdf.pages: text = page.extract_text() or "" all_text += text + "\n\n" # Parse sections sections = self._parse_sections(all_text) # Extract products products = self._extract_products(sections) # Extract submittals submittals = self._extract_submittals(sections) # Extract standards standards = self._extract_standards(all_text) return SpecExtractionResult( document_name=path.name, total_pages=page_count, sections=sections, products=products, submittals=submittals, standards_referenced=standards ) def _parse_sections(self, text: str) -> List[SpecSection]: """Parse CSI sections from specification text.""" sections = [] lines = text.split('\n') current_section = None current_part = None current_content = [] for line in lines: line = line.strip() if not line: continue # Check for section header section_match = re.match(self.CSI_SECTION_PATTERN, line, re.IGNORECASE) if section_match: # Save previous section if current_section: sections.append(self._finalize_section(current_section, current_content)) current_section = { 'number': section_match.group(1).replace(' ', ''), 'title': section_match.group(2).strip(), 
'parts': {} } current_content = [] current_part = None continue # Check for part header part_match = re.match(self.PART_PATTERN, line, re.IGNORECASE) if part_match and current_section: part_num = part_match.group(1) part_name = part_match.group(2).strip() current_part = f"part{part_num}" current_section['parts'][current_part] = { 'name': part_name, 'content': [] } continue # Add content to current part if current_section and current_part: current_section['parts'][current_part]['content'].append(line) elif current_section: current_content.append(line) # Save last section if current_section: sections.append(self._finalize_section(current_section, current_content)) return sections def _finalize_section(self, section_data: Dict, general_content: List[str]) -> SpecSection: """Finalize a section with parsed parts.""" parts = section_data.get('parts', {}) part1 = self._parse_part_content(parts.get('part1', {}).get('content', [])) part2 = self._parse_part_content(parts.get('part2', {}).get('content', [])) part3 = self._parse_part_content(parts.get('part3', {}).get('content', [])) return SpecSection( number=section_data['number'], title=section_data['title'], part1_general=part1, part2_products=part2, part3_execution=part3, raw_text='\n'.join(general_content) ) def _parse_part_content(self, content: List[str]) -> Dict[str, Any]: """Parse part content into structured data.""" result = { 'articles': {}, 'items': [] } current_article = None for line in content: # Check for article header article_match = re.match(self.ARTICLE_PATTERN, line) if article_match: current_article = article_match.group(1) result['articles'][current_article] = { 'title': article_match.group(2), 'items': [] } continue # Add to current article or general items if current_article and current_article in result['articles']: result['articles'][current_article]['items'].append(line) else: result['items'].append(line) return result def _extract_products(self, sections: List[SpecSection]) -> 
List[ProductRequirement]: """Extract product requirements from Part 2.""" products = [] for section in sections: part2 = section.part2_products for article_num, article in part2.get('articles', {}).items(): if 'MANUFACTURERS' in article['title'].upper(): for item in article['items']: # Extract manufacturer names if item.strip().startswith(('A.', 'B.', 'C.', '1.', '2.', '3.')): mfr_name = re.sub(r'^[A-Z\d]+\.\s*', '', item).strip() products.append(ProductRequirement( section=section.number, manufacturer=mfr_name, product_name='', model='', standards=[], properties={} )) elif 'MATERIALS' in article['title'].upper() or 'PRODUCTS' in article['title'].upper(): for item in article['items']: # Extract material requirements standards = self._extract_standards(item) if standards: products.append(ProductRequirement( section=section.number, manufacturer='', product_name=item[:100], model='', standards=standards, properties={} )) return products def _extract_submittals(self, sections: List[SpecSection]) -> List[SubmittalRequirement]: """Extract submittal requirements from Part 1.""" submittals = [] for section in sections: part1 = section.part1_general for article_num, article in part1.get('articles', {}).items(): if 'SUBMITTAL' in article['title'].upper(): for item in article['items']: ... (truncated)