
data-extractor

by @claude-office-skills · v1.0.0

Focused on efficiently and accurately identifying, extracting, and organizing structured data from a variety of unstructured or semi-structured data sources, ready for downstream analysis and use.

Tags: Data Extraction · Information Retrieval · NLP · Machine Learning · AI
Installation
npx skills add claude-office-skills/skills --skill data-extractor

Before / After Comparison

Before

Faced with large volumes of unstructured text, manually identifying and extracting key data is tedious and time-consuming, and important information is easily missed.

After

With the data-extractor skill, the required structured data is identified and extracted automatically and precisely, greatly improving processing efficiency and accuracy.

SKILL.md

data-extractor

# Data Extractor Skill

## Overview

This skill enables extraction of structured data from any document format using `unstructured` - a unified library for processing PDFs, Word docs, emails, HTML, and more. Get consistent, structured output regardless of input format.

## How to Use

1. Provide the document to process
2. Optionally specify extraction options
3. I'll extract structured elements with metadata

Example prompts:

- "Extract all text and tables from this PDF"
- "Parse this email and get the body, attachments, and metadata"
- "Convert this HTML page to structured elements"
- "Extract data from these mixed-format documents"

## Domain Knowledge

### unstructured Fundamentals

```python
from unstructured.partition.auto import partition

# Automatically detect and process any document
elements = partition("document.pdf")

# Access extracted elements
for element in elements:
    print(f"Type: {type(element).__name__}")
    print(f"Text: {element.text}")
    print(f"Metadata: {element.metadata}")
```

### Supported Formats

| Format | Function | Notes |
|---|---|---|
| PDF | `partition_pdf` | Native + scanned |
| Word | `partition_docx` | Full structure |
| PowerPoint | `partition_pptx` | Slides & notes |
| Excel | `partition_xlsx` | Sheets & tables |
| Email | `partition_email` | Body & attachments |
| HTML | `partition_html` | Tags preserved |
| Markdown | `partition_md` | Structure preserved |
| Plain Text | `partition_text` | Basic parsing |
| Images | `partition_image` | OCR extraction |

### Element Types

```python
from unstructured.documents.elements import (
    Title, NarrativeText, Text, ListItem, Table, Image,
    Header, Footer, PageBreak, Address, EmailAddress,
)

# Elements have a consistent structure
element.text      # Raw text content
element.metadata  # Rich metadata
element.category  # Element type
element.id        # Unique identifier
```

### Auto Partition

```python
from unstructured.partition.auto import partition
from unstructured.documents.elements import Title, Table

# Process any file type
elements = partition(
    filename="document.pdf",
    strategy="auto",  # or "fast", "hi_res", "ocr_only"
    include_metadata=True,
    include_page_breaks=True,
)

# Filter by type
titles = [e for e in elements if isinstance(e, Title)]
tables = [e for e in elements if isinstance(e, Table)]
```

### Format-Specific Partitioning

```python
# PDF with options
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="document.pdf",
    strategy="hi_res",           # High quality extraction
    infer_table_structure=True,  # Detect tables
    include_page_breaks=True,
    languages=["en"],            # OCR language
)

# Word documents
from unstructured.partition.docx import partition_docx

elements = partition_docx(
    filename="document.docx",
    include_metadata=True,
)

# HTML
from unstructured.partition.html import partition_html

elements = partition_html(
    filename="page.html",
    include_metadata=True,
)
```

### Working with Tables

```python
from unstructured.partition.auto import partition

elements = partition("report.pdf", infer_table_structure=True)

# Extract tables
for element in elements:
    if element.category == "Table":
        print("Table found:")
        print(element.text)

        # Access structured table data
        if hasattr(element, 'metadata') and element.metadata.text_as_html:
            print("HTML:", element.metadata.text_as_html)
```

### Metadata Access

```python
from unstructured.partition.auto import partition

elements = partition("document.pdf")

for element in elements:
    meta = element.metadata
    # Common metadata fields
    print(f"Page: {meta.page_number}")
    print(f"Filename: {meta.filename}")
    print(f"Filetype: {meta.filetype}")
    print(f"Coordinates: {meta.coordinates}")
    print(f"Languages: {meta.languages}")
```

### Chunking for AI/RAG

```python
from unstructured.partition.auto import partition
from unstructured.chunking.title import chunk_by_title
from unstructured.chunking.basic import chunk_elements

# Partition document
elements = partition("document.pdf")

# Chunk by title (semantic chunks)
chunks = chunk_by_title(
    elements,
    max_characters=1000,
    combine_text_under_n_chars=200,
)

# Or basic chunking
chunks = chunk_elements(
    elements,
    max_characters=500,
    overlap=50,
)

for chunk in chunks:
    print(f"Chunk ({len(chunk.text)} chars):")
    print(chunk.text[:100] + "...")
```

### Batch Processing

```python
from unstructured.partition.auto import partition
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

def process_document(file_path):
    """Process a single document."""
    try:
        elements = partition(str(file_path))
        return {
            'file': str(file_path),
            'status': 'success',
            'elements': len(elements),
            'text': '\n\n'.join([e.text for e in elements]),
        }
    except Exception as e:
        return {
            'file': str(file_path),
            'status': 'error',
            'error': str(e),
        }

def batch_process(input_dir, max_workers=4):
    """Process all documents in a directory."""
    input_path = Path(input_dir)
    files = list(input_path.glob('*'))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(process_document, files))

    return results
```

### Export Formats

```python
from unstructured.partition.auto import partition
from unstructured.staging.base import elements_to_json, elements_to_dicts

elements = partition("document.pdf")

# To JSON string
json_str = elements_to_json(elements)

# To list of dicts
dicts = elements_to_dicts(elements)

# To DataFrame
import pandas as pd
df = pd.DataFrame(dicts)
```

## Best Practices

1. **Choose Strategy Wisely**: "fast" for speed, "hi_res" for accuracy
2. **Enable Table Detection**: For documents with tables
3. **Specify Language**: For better OCR on non-English docs
4. **Chunk for RAG**: Use semantic chunking for AI applications
5. **Handle Errors**: Some formats may fail; degrade gracefully

## Common Patterns

### Document to JSON

```python
def document_to_json(file_path, output_path=None):
    """Convert a document to structured JSON."""
    from unstructured.partition.auto import partition
    import json

    elements = partition(file_path)

    # Create structured output
    output = {
        'source': file_path,
        'elements': []
    }

    for element in elements:
        output['elements'].append({
            'type': type(element).__name__,
            'text': element.text,
            'metadata': {
                'page': element.metadata.page_number,
                'coordinates': element.metadata.coordinates.to_dict()
                    if element.metadata.coordinates else None,
            }
        })

    if output_path:
        with open(output_path, 'w') as f:
            json.dump(output, f, indent=2)

    return output
```

### Email Parser

```python
from unstructured.partition.email import partition_email

def parse_email(email_path):
    """Extract structured data from an email."""
    elements = partition_email(email_path)

    email_data = {
        'subject': None,
        'from': None,
        'to': [],
        'date': None,
        'body': [],
        'attachments': []
    }

    for element in elements:
        meta = element.metadata

        # Extract headers from metadata
        if meta.subject:
            email_data['subject'] = meta.subject
        if meta.sent_from:
            email_data['from'] = meta.sent_from
        if meta.sent_to:
            email_data['to'] = meta.sent_to

        # Body content
        email_data['body'].append({
            'type': type(element).__name__,
            'text': element.text
        })

    return email_data
```

## Examples

### Example 1: Research Paper Extraction

```python
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

def extract_paper(pdf_path):
    """Extract structured data from a research paper."""
    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True,
        include_page_breaks=True,
    )

    paper = {
        'title': None,
        'abstract': None,
        'sections': [],
        'tables': [],
        'references': []
    }

    # Find title (usually the first Title element)
    for element in elements:
        if element.category == "Title" and not paper['title']:
            paper['title'] = element.text
            break

    # Extract tables
    for element in elements:
        if element.category == "Table":
            paper['tables'].append({
                'page': element.metadata.page_number,
                'content': element.text,
                'html': element.metadata.text_as_html
                    if hasattr(element.metadata, 'text_as_html') else None,
            })

    # Chunk into sections
    chunks = chunk_by_title(elements, max_characters=2000)
    for chunk in chunks:
        if chunk.category == "Title":
            paper['sections'].append({
                'title': chunk.text,
                'content': ''
            })
        elif paper['sections']:
            paper['sections'][-1]['content'] += chunk.text + '\n'

    return paper

paper = extract_paper('research_paper.pdf')
print(f"Title: {paper['title']}")
print(f"Tables: {len(paper['tables'])}")
print(f"Sections: {len(paper['sections'])}")
```

### Example 2: Invoice Data Extraction

```python
from unstructured.partition.auto import partition
import re

def extract_invoice_data(file_path):
    """Extract key data from an invoice."""
    elements = partition(file_path, strategy="hi_res")

    # Combine all text
    full_text = '\n'.join([e.text for e in elements])

    invoice = {
        'invoice_number': None,
        'date': None,
        'total': None,
        'vendor': None,
        'line_items': [],
        'tables': []
    }

    # Extract patterns
    inv_match = re.search(r'Invoice\s*#?\s*:?\s*([\w-]+)', full_text, re.I)
    if inv_match:
        invoice['invoice_number'] = inv_match.group(1)

    date_match = re.search(r'Date\s*:?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})', full_text, re.I)
    if date_match:
        invoice['date'] = date_match.group(1)

    total_match = re.search(r'Total\s*:?\s*\$?([\d,]+\.?\d*)', full_text, re.I)
    if total_match:
        invoice['total'] = float(total_match.group(1).replace(',', ''))

    # Extract tables
    for element in elements:
        if element.category == "Table":
            invoice['tables'].append(element.text)

    return invoice

invoice = extract_invoice_data('invoice.pdf')
print(f"Invoice #: {invoice['invoice_number']}")
print(f"Total: ${invoice['total']}")
```

### Example 3: Document Corpus Builder

```python
from unstructured.partition.auto import partition
from unstructured.chunking.title import chunk_by_title
from pathlib import Path
import json

def build_corpus(input_dir, output_path):
    """Build a searchable corpus from a document collection."""
    input_path = Path(input_dir)
    corpus = []

    # Support multiple formats
    patterns = ['*.pdf', '*.docx', '*.html', '*.txt', '*.md']
    files = []
    for pattern in patterns:
        files.extend(input_path.glob(pattern))

    for file in files:
        print(f"Processing: {file.name}")
        try:
            elements = partition(str(file))
            chunks = chunk_by_title(elements, max_characters=1000)

            for i, chunk in enumerate(chunks):
                corpus.append({
                    'id': f"{file.stem}_{i}",
                    'source': str(file),
                    'type': type(chunk).__name__,
                    'text': chunk.text,
                    'page': chunk.metadata.page_number
                        if chunk.metadata.page_number else None,
                })
        except Exception as e:
            print(f"  Error: {e}")

    # Save corpus
    with open(output_path, 'w') as f:
        json.dump(corpus, f, indent=2)

    print(f"Corpus built: {len(corpus)} chunks from {len(files)} files")
    return corpus

corpus = build_corpus('./documents', 'corpus.json')
```

## Limitations

- Complex layouts may need manual review
- OCR quality depends on image quality
- Large files may need chunking
- Some proprietary formats are not supported
- API rate limits apply for cloud processing

## Installation

```shell
# Basic installation
pip install unstructured

# With all dependencies
pip install "unstructured[all-docs]"

# For PDF processing
pip install "unstructured[pdf]"

# For specific formats
pip install "unstructured[docx,pptx,xlsx]"
```

## Resources

- unstructured GitHub
- Documentation
- Unstructured API

Weekly Installs: 196 · Repository: claude-office-s…s/skills · GitHub Stars: 13 · First Seen: 13 days ago
Security Audits: Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on: claude-code (148), opencode (93), github-copilot (92), gemini-cli (90), amp (90), cline (90)
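To make the semantics of title-based chunking concrete without installing `unstructured`, here is a minimal pure-Python sketch. It is NOT the library's implementation: `chunk_by_title_sketch`, the `(category, text)` tuples, and the size-cap behavior are simplified assumptions that only illustrate the idea of starting a new chunk at each `Title` element.

```python
def chunk_by_title_sketch(elements, max_characters=1000):
    """elements: list of (category, text) tuples; returns chunk strings."""
    chunks, current = [], ""
    for category, text in elements:
        # A new Title starts a new chunk
        if category == "Title" and current:
            chunks.append(current)
            current = ""
        # So does exceeding the size cap
        if current and len(current) + len(text) + 1 > max_characters:
            chunks.append(current)
            current = ""
        current = f"{current}\n{text}".strip()
    if current:
        chunks.append(current)
    return chunks

docs = [
    ("Title", "Introduction"),
    ("NarrativeText", "Background paragraph."),
    ("Title", "Methods"),
    ("NarrativeText", "We did things."),
]
print(chunk_by_title_sketch(docs))
# → ['Introduction\nBackground paragraph.', 'Methods\nWe did things.']
```

The real `chunk_by_title` additionally merges short sections (`combine_text_under_n_chars`) and returns element objects with metadata rather than plain strings.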
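The invoice-extraction regexes above can be tested on their own with only the standard library. The sample invoice text below is synthetic, invented purely to demonstrate the patterns:

```python
import re

# Synthetic invoice text (made up for illustration)
sample = """ACME Corp
Invoice #: INV-2024-001
Date: 03/18/2026
Total: $1,234.50
"""

inv = re.search(r'Invoice\s*#?\s*:?\s*([\w-]+)', sample, re.I)
date = re.search(r'Date\s*:?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})', sample, re.I)
total = re.search(r'Total\s*:?\s*\$?([\d,]+\.?\d*)', sample, re.I)

print(inv.group(1))    # → INV-2024-001
print(date.group(1))   # → 03/18/2026
print(float(total.group(1).replace(',', '')))  # → 1234.5
```

Note that `re.search(r'Total', ..., re.I)` would also match inside "Subtotal"; on real invoices a word boundary (`r'\bTotal\b'`) is safer.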
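Detected tables are exposed as HTML via `metadata.text_as_html`. As a stdlib-only sketch of consuming that output, the parser below turns a simple HTML table into rows of cells; it assumes a flat `<table><tr><td>` structure with no nesting (real tables may need lxml or `pandas.read_html`):

```python
from html.parser import HTMLParser

class TableRows(HTMLParser):
    """Collect <td>/<th> cell text into a list of rows."""

    def __init__(self):
        super().__init__()
        self.rows, self._row, self._in_cell = [], [], False

    def handle_starttag(self, tag, attrs):
        if tag == "tr":
            self._row = []
        elif tag in ("td", "th"):
            self._in_cell = True
            self._row.append("")

    def handle_data(self, data):
        if self._in_cell:
            self._row[-1] += data.strip()

    def handle_endtag(self, tag):
        if tag in ("td", "th"):
            self._in_cell = False
        elif tag == "tr":
            self.rows.append(self._row)

html = ("<table><tr><th>Item</th><th>Qty</th></tr>"
        "<tr><td>Widget</td><td>3</td></tr></table>")
p = TableRows()
p.feed(html)
print(p.rows)  # → [['Item', 'Qty'], ['Widget', '3']]
```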

User Reviews (0)

No reviews yet.

Statistics

Installs: 0
Rating: 0.0 / 5.0
Version: 1.0.0
Last Updated: March 18, 2026
Comparison Cases: 1


Compatible Platforms

🔧Claude Code

Timeline

Created: March 18, 2026
Last Updated: March 18, 2026