# services/attribute_extractor.py
import base64
import json
import re
from collections import defaultdict
from typing import Dict, List, Optional

import pytesseract
import spacy
from anthropic import Anthropic
from PIL import Image
class HybridAttributeExtractor:
    """
    Hybrid product-attribute extractor.

    Pipeline:
      1. Fast regex + spaCy NER over the product title/description.
      2. OCR text from up to three product images, fed through the same
         NLP pass and merged in.
      3. An Anthropic LLM call that validates and enriches the NLP
         results. NLP and LLM outputs are merged with per-attribute
         confidence scores; disagreements are flagged for review.
    """

    def __init__(self, anthropic_api_key: str, product_type_mappings: Optional[Dict] = None):
        """
        Args:
            anthropic_api_key: key used to construct the Anthropic client.
            product_type_mappings: optional category mapping table; falls
                back to `_load_default_mappings()` when omitted or falsy.
        """
        self.nlp = spacy.load("en_core_web_sm")
        self.client = Anthropic(api_key=anthropic_api_key)
        self.product_type_mappings = product_type_mappings or self._load_default_mappings()

        # Regex patterns for cheap extraction of structured attributes.
        self.patterns = {
            'size': [
                r'\b(XXS|XS|S|M|L|XL|XXL|XXXL)\b',
                r'\b(\d+(?:\.\d+)?)\s*(inch|inches|cm|mm|meter|metres?|ft|feet|")\b',
                r'\b(small|medium|large|extra large)\b'
            ],
            'color': [
                r'\b(black|white|red|blue|green|yellow|orange|purple|pink|brown|gray|grey|silver|gold|beige|navy|maroon|olive|teal|turquoise|lavender|cream|ivory)\b'
            ],
            'weight': [
                r'\b(\d+(?:\.\d+)?)\s*(kg|g|lb|lbs|oz|pounds?|grams?|kilograms?)\b'
            ],
            'material': [
                r'\b(cotton|polyester|silk|wool|leather|denim|linen|nylon|spandex|rayon|acrylic|metal|plastic|wood|glass|ceramic|steel|aluminum|rubber)\b'
            ],
            'brand': [
                r'(?:by|from|brand:?)\s+([A-Z][a-zA-Z0-9\s&]+?)(?:\s|$|,|\.|;)'
            ]
        }

        # Minimum per-attribute confidence before a value counts as reliable.
        self.confidence_threshold = 0.6

    def _load_default_mappings(self) -> Dict:
        """
        Fallback product-type mappings used when none are supplied.

        BUGFIX: __init__ referenced this method but it was never defined,
        so constructing the extractor without explicit mappings raised
        AttributeError. An empty dict is the safe neutral default; callers
        that need real mappings pass them explicitly.
        """
        return {}

    def extract_attributes(self, product_data: Dict) -> Dict:
        """
        Run the full extraction pipeline for one product.

        Args:
            product_data: dict with optional keys 'title', 'description',
                'short_description', and 'images' (list of file paths).

        Returns:
            Merged attribute dict including `<attr>_confidence` entries and
            `<attr>_conflict` / `<attr>_nlp_value` flags where NLP and LLM
            disagreed.
        """
        # Phase 1: quick regex/NER extraction from the text fields.
        nlp_attributes = self._extract_with_nlp(
            product_data.get('title', ''),
            product_data.get('description', '')
        )

        # Phase 2: OCR from images, run through the same NLP pass.
        ocr_text = ""
        if product_data.get('images'):
            ocr_text = self._extract_text_from_images(product_data['images'])
            if ocr_text:
                ocr_attributes = self._extract_with_nlp("", ocr_text)
                nlp_attributes = self._merge_attributes(nlp_attributes, ocr_attributes)

        # Phase 3: always call the LLM to enrich and validate NLP results.
        # (_needs_llm_extraction is available for callers that want to gate
        # this call, but the current pipeline enriches unconditionally.)
        llm_attributes = self._extract_with_llm(
            product_data,
            nlp_attributes,
            ocr_text
        )
        final_attributes = self._merge_attributes(nlp_attributes, llm_attributes)

        return final_attributes

    def _extract_with_nlp(self, title: str, description: str) -> Dict:
        """
        Fast extraction using regex patterns and spaCy NER.

        Returns a dict of attribute -> value (or list of values when
        multiple distinct matches were found), plus `<attr>_confidence`
        entries: 0.8 for a single unanimous value, 0.5 otherwise.
        """
        # Regex matching is done on lowercased text; NER gets original case.
        text = f"{title} {description}".lower()
        attributes = defaultdict(list)

        # Pattern matching for structured attributes.
        for attr_type, patterns in self.patterns.items():
            for pattern in patterns:
                matches = re.finditer(pattern, text, re.IGNORECASE)
                for match in matches:
                    value = match.group(1) if match.groups() else match.group(0)
                    attributes[attr_type].append(value.strip())

        # Named Entity Recognition for brands, products, prices.
        doc = self.nlp(title + " " + description)
        for ent in doc.ents:
            # Only take ORG as brand if the regex pass found none.
            if ent.label_ == "ORG" and 'brand' not in attributes:
                attributes['brand'].append(ent.text)
            elif ent.label_ == "PRODUCT":
                attributes['product_type'].append(ent.text)
            elif ent.label_ == "MONEY":
                attributes['price'].append(ent.text)

        # Deduplicate and attach confidence scores.
        cleaned_attributes = {}
        for key, values in attributes.items():
            if values:
                # Collapse to a scalar when all matches agree; otherwise
                # keep the raw list for downstream disambiguation.
                cleaned_attributes[key] = list(set(values))[0] if len(set(values)) == 1 else values
                cleaned_attributes[f'{key}_confidence'] = 0.8 if len(set(values)) == 1 else 0.5

        return cleaned_attributes

    def _extract_text_from_images(self, image_paths: List[str]) -> str:
        """
        Extract text from product images using Tesseract OCR.

        Best-effort: unreadable images are logged and skipped. At most the
        first three images are processed to bound latency.
        """
        extracted_text = []

        for img_path in image_paths[:3]:  # Limit to 3 images
            try:
                img = Image.open(img_path)
                text = pytesseract.image_to_string(img)
                if text.strip():
                    extracted_text.append(text.strip())
            except Exception as e:
                print(f"OCR error for {img_path}: {e}")

        return " ".join(extracted_text)

    def _needs_llm_extraction(self, attributes: Dict, product_data: Dict) -> bool:
        """
        Heuristic: is an LLM pass worthwhile for this product?

        True when any critical attribute is missing, any extracted value is
        below the confidence threshold, or the description looks complex.
        """
        # Check if critical attributes are missing.
        critical_attrs = ['category', 'brand', 'color', 'size']
        missing_critical = any(attr not in attributes for attr in critical_attrs)

        # Check confidence levels of what NLP did extract.
        low_confidence = any(
            attributes.get(f'{key}_confidence', 0) < self.confidence_threshold
            for key in attributes.keys() if not key.endswith('_confidence')
        )

        # Long or feature-laden descriptions tend to hide attributes the
        # regexes miss.
        description = product_data.get('description', '')
        is_complex = len(description.split()) > 100 or 'features' in description.lower()

        return missing_critical or low_confidence or is_complex

    @staticmethod
    def _parse_llm_json(raw: str) -> Dict:
        """
        Parse the model's reply as JSON, tolerating markdown fences.

        BUGFIX: the previous code fed `response.content[0].text` straight
        into json.loads, which fails whenever the model wraps its answer in
        ```json fences or surrounding prose. Strip fences, then fall back to
        the outermost {...} span before parsing.
        """
        text = raw.strip()
        if text.startswith("```"):
            text = re.sub(r'^```[a-zA-Z]*\s*|\s*```$', '', text)
        start = text.find('{')
        end = text.rfind('}')
        if start != -1 and end > start:
            text = text[start:end + 1]
        return json.loads(text)

    def _extract_with_llm(self, product_data: Dict, existing_attrs: Dict, ocr_text: str) -> Dict:
        """
        Use the LLM to extract comprehensive attributes and validate NLP results.

        Sends title/description/OCR text plus up to three images, expects a
        JSON object back, and stamps every non-null field with 0.95
        confidence. Returns {} on any API or parse failure (best-effort).
        """
        prompt = f"""Analyze this product and extract ALL possible attributes with high accuracy.
Title: {product_data.get('title', 'N/A')}
Description: {product_data.get('description', 'N/A')}
Short Description: {product_data.get('short_description', 'N/A')}
Text from images (OCR): {ocr_text if ocr_text else 'N/A'}
NLP Pre-extracted attributes (validate and enhance): {existing_attrs}
Extract a comprehensive JSON object with these fields (include all that apply):
**Basic Info:**
- category: specific product category/type
- subcategory: more specific classification
- brand: brand name
- model: model number/name
- product_line: product series/collection
**Physical Attributes:**
- color: all colors (list if multiple)
- size: size information (with units)
- dimensions: length/width/height with units
- weight: weight with units
- material: materials used (list all)
- finish: surface finish/texture
**Technical Specs (if applicable):**
- specifications: key technical specs as object
- compatibility: what it works with
- capacity: storage/volume capacity
- power: power requirements/battery info
**Commercial Info:**
- condition: new/used/refurbished
- warranty: warranty information
- country_of_origin: manufacturing country
- certifications: safety/quality certifications
**Descriptive:**
- key_features: list of 5-8 main features
- benefits: main benefits/use cases
- target_audience: who this is for
- usage_instructions: how to use (if mentioned)
- care_instructions: care/maintenance info
- style: style/aesthetic (modern, vintage, etc)
- season: seasonal relevance (if applicable)
- occasion: suitable occasions (if applicable)
**Additional:**
- package_contents: what's included
- variants: available variants/options
- tags: relevant search tags (list)
Only include fields where you have high confidence. Use null for uncertain values.
For lists, provide all relevant items. Be thorough and extract every possible detail."""
        content = [{"type": "text", "text": prompt}]

        # Attach up to three images as base64 blocks for multimodal context.
        if product_data.get('images'):
            for img_path in product_data['images'][:3]:
                try:
                    with open(img_path, 'rb') as f:
                        img_data = base64.b64encode(f.read()).decode()

                    # Determine media type from the file extension;
                    # default to JPEG.
                    media_type = "image/jpeg"
                    if img_path.lower().endswith('.png'):
                        media_type = "image/png"
                    elif img_path.lower().endswith('.webp'):
                        media_type = "image/webp"

                    content.append({
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": img_data
                        }
                    })
                except Exception as e:
                    print(f"Error processing image {img_path}: {e}")

        try:
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=2048,  # Increased for comprehensive extraction
                messages=[{"role": "user", "content": content}]
            )

            # Parse JSON response (fence-tolerant).
            llm_result = self._parse_llm_json(response.content[0].text)

            # Stamp high confidence on every non-null LLM field.
            for key in llm_result:
                if llm_result[key] is not None:
                    llm_result[f'{key}_confidence'] = 0.95

            return llm_result

        except Exception as e:
            print(f"LLM extraction error: {e}")
            return {}

    def _identify_missing_attributes(self, existing_attrs: Dict) -> List[str]:
        """
        List important attributes that are absent or below 0.7 confidence.
        """
        important_attrs = ['category', 'brand', 'color', 'size', 'material', 'key_features']
        missing = []

        for attr in important_attrs:
            if attr not in existing_attrs or existing_attrs.get(f'{attr}_confidence', 0) < 0.7:
                missing.append(attr)

        return missing

    def _merge_attributes(self, base: Dict, additional: Dict) -> Dict:
        """
        Merge two attribute dicts, preferring `additional` (the LLM pass).

        Rules: null in `additional` keeps the base value; a differing value
        overrides but records `<key>_nlp_value` and `<key>_conflict`; a
        matching value boosts confidence (capped at 0.99).
        """
        merged = {}

        # Start with all base (NLP) attributes and their confidences.
        for key, value in base.items():
            if not key.endswith('_confidence'):
                merged[key] = value
                merged[f'{key}_confidence'] = base.get(f'{key}_confidence', 0.7)

        # Add or override with the additional (LLM) attributes.
        for key, value in additional.items():
            if key.endswith('_confidence'):
                continue

            if value is None:
                # Keep the base value if the LLM returned null.
                continue

            if key not in merged:
                # LLM found a brand-new attribute.
                merged[key] = value
                merged[f'{key}_confidence'] = additional.get(f'{key}_confidence', 0.95)
            else:
                llm_conf = additional.get(f'{key}_confidence', 0.95)
                nlp_conf = merged.get(f'{key}_confidence', 0.7)

                if str(value).lower() != str(merged[key]).lower():
                    # Values differ — take the LLM value but flag the
                    # conflict and keep the NLP value for review.
                    merged[key] = value
                    merged[f'{key}_confidence'] = llm_conf
                    merged[f'{key}_nlp_value'] = base.get(key)
                    merged[f'{key}_conflict'] = True
                else:
                    # Values agree — boost confidence.
                    merged[key] = value
                    merged[f'{key}_confidence'] = min(0.99, (llm_conf + nlp_conf) / 2 + 0.1)

        return merged
# Example usage
if __name__ == "__main__":
    # Demo run: construct the extractor and pull attributes from a sample
    # product record (replace the API key and image paths with real ones).
    extractor = HybridAttributeExtractor(anthropic_api_key="your-api-key")

    sample_product = {
        'title': 'Nike Air Max 270 Running Shoes - Black/White',
        'description': (
            'Premium running shoes with Max Air cushioning. '
            'Breathable mesh upper, rubber outsole. Perfect for daily training.'
        ),
        'images': ['path/to/image1.jpg', 'path/to/image2.jpg'],
    }

    extracted = extractor.extract_attributes(sample_product)
    print(extracted)