# services/attribute_extractor.py
"""Hybrid product-attribute extraction.

Combines a fast regex/spaCy pass over title/description text, optional OCR of
product images, and an Anthropic LLM call that enriches and validates the
NLP results. Results from the two phases are merged with per-attribute
confidence scores and conflict flags.
"""

import base64
import json
import logging
import re
from collections import defaultdict
from typing import Dict, List, Optional

import pytesseract
import spacy
from anthropic import Anthropic
from PIL import Image

logger = logging.getLogger(__name__)


class HybridAttributeExtractor:
    """
    Hybrid extractor using NLP for structured data and LLM for complex/ambiguous cases
    """

    def __init__(self, anthropic_api_key: str,
                 product_type_mappings: Optional[Dict] = None):
        """
        Args:
            anthropic_api_key: API key for the Anthropic client.
            product_type_mappings: optional category mappings; falls back to
                the built-in defaults when omitted.
        """
        self.nlp = spacy.load("en_core_web_sm")
        self.client = Anthropic(api_key=anthropic_api_key)
        self.product_type_mappings = (
            product_type_mappings or self._load_default_mappings()
        )

        # Patterns for common attributes, compiled once so repeated
        # extractions don't pay the per-call compile/cache-lookup cost.
        raw_patterns = {
            'size': [
                r'\b(XXS|XS|S|M|L|XL|XXL|XXXL)\b',
                r'\b(\d+(?:\.\d+)?)\s*(inch|inches|cm|mm|meter|metres?|ft|feet|")\b',
                r'\b(small|medium|large|extra large)\b'
            ],
            'color': [
                r'\b(black|white|red|blue|green|yellow|orange|purple|pink|brown|gray|grey|silver|gold|beige|navy|maroon|olive|teal|turquoise|lavender|cream|ivory)\b'
            ],
            'weight': [
                r'\b(\d+(?:\.\d+)?)\s*(kg|g|lb|lbs|oz|pounds?|grams?|kilograms?)\b'
            ],
            'material': [
                r'\b(cotton|polyester|silk|wool|leather|denim|linen|nylon|spandex|rayon|acrylic|metal|plastic|wood|glass|ceramic|steel|aluminum|rubber)\b'
            ],
            'brand': [
                r'(?:by|from|brand:?)\s+([A-Z][a-zA-Z0-9\s&]+?)(?:\s|$|,|\.|;)'
            ]
        }
        self.patterns = {
            attr: [re.compile(p, re.IGNORECASE) for p in pats]
            for attr, pats in raw_patterns.items()
        }

        # Minimum per-attribute confidence below which a value is unreliable.
        self.confidence_threshold = 0.6

    def _load_default_mappings(self) -> Dict:
        """Fallback product-type mappings when the caller supplies none.

        NOTE(review): the original code called this method without ever
        defining it, so omitting ``product_type_mappings`` raised
        AttributeError. An empty dict restores the documented default path;
        populate with real mappings as needed.
        """
        return {}

    def extract_attributes(self, product_data: Dict) -> Dict:
        """
        Main extraction method - uses NLP first, LLM for gaps.

        Args:
            product_data: dict with optional keys 'title', 'description',
                'short_description', and 'images' (list of file paths).

        Returns:
            Merged attribute dict with per-attribute ``*_confidence`` scores.
        """
        # Phase 1: Quick NLP extraction
        nlp_attributes = self._extract_with_nlp(
            product_data.get('title', ''),
            product_data.get('description', '')
        )

        # Phase 2: OCR from images if provided
        ocr_text = ""
        if product_data.get('images'):
            ocr_text = self._extract_text_from_images(product_data['images'])
            if ocr_text:
                ocr_attributes = self._extract_with_nlp("", ocr_text)
                nlp_attributes = self._merge_attributes(nlp_attributes, ocr_attributes)

        # Phase 3: Always call LLM to enrich and validate NLP results
        llm_attributes = self._extract_with_llm(
            product_data,
            nlp_attributes,
            ocr_text
        )

        return self._merge_attributes(nlp_attributes, llm_attributes)

    def _extract_with_nlp(self, title: str, description: str) -> Dict:
        """
        Fast extraction using compiled regexes and spaCy NER.
        """
        text = f"{title} {description}".lower()
        attributes = defaultdict(list)

        # Pattern matching for structured attributes
        for attr_type, patterns in self.patterns.items():
            for pattern in patterns:
                for match in pattern.finditer(text):
                    # Prefer the first capture group when one exists (e.g. the
                    # numeric part of a measurement). TODO(review): units in a
                    # second group (cm, kg, ...) are currently dropped.
                    value = match.group(1) if match.groups() else match.group(0)
                    attributes[attr_type].append(value.strip())

        # Named Entity Recognition for brands, organizations
        doc = self.nlp(title + " " + description)
        for ent in doc.ents:
            if ent.label_ == "ORG" and 'brand' not in attributes:
                attributes['brand'].append(ent.text)
            elif ent.label_ == "PRODUCT":
                attributes['product_type'].append(ent.text)
            elif ent.label_ == "MONEY":
                attributes['price'].append(ent.text)

        # Deduplicate and clean. dict.fromkeys preserves first-occurrence
        # order; the original list(set(...))[0] picked an arbitrary element.
        cleaned_attributes = {}
        for key, values in attributes.items():
            if not values:
                continue
            unique = list(dict.fromkeys(values))
            cleaned_attributes[key] = unique[0] if len(unique) == 1 else unique
            # A single consistent value is more trustworthy than conflicting ones.
            cleaned_attributes[f'{key}_confidence'] = 0.8 if len(unique) == 1 else 0.5

        return cleaned_attributes

    def _extract_text_from_images(self, image_paths: List[str]) -> str:
        """
        Extract text from product images using OCR.

        Best-effort: unreadable images are logged and skipped.
        """
        extracted_text = []
        for img_path in image_paths[:3]:  # Limit to 3 images
            try:
                img = Image.open(img_path)
                text = pytesseract.image_to_string(img)
                if text.strip():
                    extracted_text.append(text.strip())
            except Exception as e:
                logger.warning("OCR error for %s: %s", img_path, e)
        return " ".join(extracted_text)

    def _needs_llm_extraction(self, attributes: Dict, product_data: Dict) -> bool:
        """
        Determine if LLM extraction is needed based on confidence and completeness.

        NOTE(review): currently unused — extract_attributes always calls the
        LLM. Kept for callers that want a cheaper conditional pipeline.
        """
        # Check if critical attributes are missing
        critical_attrs = ['category', 'brand', 'color', 'size']
        missing_critical = any(attr not in attributes for attr in critical_attrs)

        # Check confidence levels
        low_confidence = any(
            attributes.get(f'{key}_confidence', 0) < self.confidence_threshold
            for key in attributes.keys()
            if not key.endswith('_confidence')
        )

        # Check if description is complex/unstructured
        description = product_data.get('description', '')
        is_complex = len(description.split()) > 100 or 'features' in description.lower()

        return missing_critical or low_confidence or is_complex

    @staticmethod
    def _parse_llm_json(text: str) -> Dict:
        """Parse a JSON object out of an LLM reply.

        Tolerates markdown code fences and surrounding prose; raises
        json.JSONDecodeError if no valid object can be found.
        """
        text = text.strip()
        if text.startswith("```"):
            # Strip a ```json ... ``` fence.
            text = re.sub(r'^```(?:json)?\s*|\s*```$', '', text).strip()
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            # Fall back to the first {...} span in the reply.
            match = re.search(r'\{.*\}', text, re.DOTALL)
            if match:
                return json.loads(match.group(0))
            raise

    def _extract_with_llm(self, product_data: Dict, existing_attrs: Dict,
                          ocr_text: str) -> Dict:
        """
        Use LLM to extract comprehensive attributes and validate NLP results.

        Returns {} on any API or parse failure (best-effort enrichment).
        """
        prompt = f"""Analyze this product and extract ALL possible attributes with high accuracy.

Title: {product_data.get('title', 'N/A')}
Description: {product_data.get('description', 'N/A')}
Short Description: {product_data.get('short_description', 'N/A')}

Text from images (OCR): {ocr_text if ocr_text else 'N/A'}

NLP Pre-extracted attributes (validate and enhance): {existing_attrs}

Extract a comprehensive JSON object with these fields (include all that apply):

**Basic Info:**
- category: specific product category/type
- subcategory: more specific classification
- brand: brand name
- model: model number/name
- product_line: product series/collection

**Physical Attributes:**
- color: all colors (list if multiple)
- size: size information (with units)
- dimensions: length/width/height with units
- weight: weight with units
- material: materials used (list all)
- finish: surface finish/texture

**Technical Specs (if applicable):**
- specifications: key technical specs as object
- compatibility: what it works with
- capacity: storage/volume capacity
- power: power requirements/battery info

**Commercial Info:**
- condition: new/used/refurbished
- warranty: warranty information
- country_of_origin: manufacturing country
- certifications: safety/quality certifications

**Descriptive:**
- key_features: list of 5-8 main features
- benefits: main benefits/use cases
- target_audience: who this is for
- usage_instructions: how to use (if mentioned)
- care_instructions: care/maintenance info
- style: style/aesthetic (modern, vintage, etc)
- season: seasonal relevance (if applicable)
- occasion: suitable occasions (if applicable)

**Additional:**
- package_contents: what's included
- variants: available variants/options
- tags: relevant search tags (list)

Only include fields where you have high confidence. Use null for uncertain values.
For lists, provide all relevant items. Be thorough and extract every possible detail.
Return ONLY the JSON object with no other text and no markdown code fences."""

        content = [{"type": "text", "text": prompt}]

        # Add images if available (up to 3 for better context).
        if product_data.get('images'):
            for img_path in product_data['images'][:3]:
                try:
                    with open(img_path, 'rb') as f:
                        img_data = base64.b64encode(f.read()).decode()

                    # Determine media type from the file extension.
                    media_type = "image/jpeg"
                    if img_path.lower().endswith('.png'):
                        media_type = "image/png"
                    elif img_path.lower().endswith('.webp'):
                        media_type = "image/webp"

                    content.append({
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": img_data
                        }
                    })
                except Exception as e:
                    logger.warning("Error processing image %s: %s", img_path, e)

        try:
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=2048,  # Increased for comprehensive extraction
                messages=[{"role": "user", "content": content}]
            )

            llm_result = self._parse_llm_json(response.content[0].text)

            # Add high confidence to LLM results. Iterate over a snapshot:
            # adding keys while iterating the dict itself raises RuntimeError.
            for key in list(llm_result):
                if llm_result[key] is not None:
                    llm_result[f'{key}_confidence'] = 0.95

            return llm_result

        except Exception as e:
            logger.error("LLM extraction error: %s", e)
            return {}

    def _identify_missing_attributes(self, existing_attrs: Dict) -> List[str]:
        """
        Identify which attributes are missing or low confidence.
        """
        important_attrs = ['category', 'brand', 'color', 'size', 'material',
                           'key_features']
        missing = []
        for attr in important_attrs:
            if (attr not in existing_attrs
                    or existing_attrs.get(f'{attr}_confidence', 0) < 0.7):
                missing.append(attr)
        return missing

    def _merge_attributes(self, base: Dict, additional: Dict) -> Dict:
        """
        Intelligently merge attributes, preferring LLM for new attributes
        and validation.

        Conflicting values keep the LLM value but record the NLP value under
        ``<key>_nlp_value`` and set ``<key>_conflict`` for downstream review.
        """
        merged = {}

        # Start with all NLP attributes.
        for key, value in base.items():
            if not key.endswith('_confidence'):
                merged[key] = value
                merged[f'{key}_confidence'] = base.get(f'{key}_confidence', 0.7)

        # Add or override with LLM attributes.
        for key, value in additional.items():
            if key.endswith('_confidence'):
                continue
            if value is None:
                # Keep NLP value if LLM returns null.
                continue

            if key not in merged:
                # LLM found a new attribute.
                merged[key] = value
                merged[f'{key}_confidence'] = additional.get(f'{key}_confidence', 0.95)
            else:
                llm_conf = additional.get(f'{key}_confidence', 0.95)
                nlp_conf = merged.get(f'{key}_confidence', 0.7)

                if str(value).lower() != str(merged[key]).lower():
                    # Values differ - use LLM but flag the conflict.
                    merged[key] = value
                    merged[f'{key}_confidence'] = llm_conf
                    merged[f'{key}_nlp_value'] = base.get(key)  # NLP value for reference
                    merged[f'{key}_conflict'] = True
                else:
                    # Values match - boost confidence.
                    merged[key] = value
                    merged[f'{key}_confidence'] = min(
                        0.99, (llm_conf + nlp_conf) / 2 + 0.1
                    )

        return merged


# Example usage
if __name__ == "__main__":
    extractor = HybridAttributeExtractor(anthropic_api_key="your-api-key")

    product = {
        'title': 'Nike Air Max 270 Running Shoes - Black/White',
        'description': 'Premium running shoes with Max Air cushioning. Breathable mesh upper, rubber outsole. Perfect for daily training.',
        'images': ['path/to/image1.jpg', 'path/to/image2.jpg']
    }

    attributes = extractor.extract_attributes(product)
    print(attributes)