|
|
@@ -0,0 +1,322 @@
|
|
|
+# services/attribute_extractor.py
|
|
|
+import re
|
|
|
+import spacy
|
|
|
+from typing import Dict, List, Optional
|
|
|
+from anthropic import Anthropic
|
|
|
+import base64
|
|
|
+from PIL import Image
|
|
|
+import pytesseract
|
|
|
+from collections import defaultdict
|
|
|
+
|
|
|
class HybridAttributeExtractor:
    """
    Hybrid product-attribute extractor.

    Combines three signal sources:
      1. Fast regex + spaCy NLP extraction from title/description.
      2. OCR text pulled from product images (pytesseract).
      3. An Anthropic LLM pass that validates and enriches the NLP results.

    Results are merged with per-attribute ``*_confidence`` scores; conflicts
    between NLP and LLM values are flagged with ``*_conflict`` for review.
    """

    def __init__(self, anthropic_api_key: str, product_type_mappings: Optional[Dict] = None):
        """
        Args:
            anthropic_api_key: API key for the Anthropic client.
            product_type_mappings: optional domain-specific mappings; falls
                back to ``_load_default_mappings()`` when omitted.
        """
        self.nlp = spacy.load("en_core_web_sm")
        self.client = Anthropic(api_key=anthropic_api_key)
        self.product_type_mappings = product_type_mappings or self._load_default_mappings()

        # Regex patterns for common structured attributes.
        self.patterns = {
            'size': [
                r'\b(XXS|XS|S|M|L|XL|XXL|XXXL)\b',
                r'\b(\d+(?:\.\d+)?)\s*(inch|inches|cm|mm|meter|metres?|ft|feet|")\b',
                r'\b(small|medium|large|extra large)\b'
            ],
            'color': [
                r'\b(black|white|red|blue|green|yellow|orange|purple|pink|brown|gray|grey|silver|gold|beige|navy|maroon|olive|teal|turquoise|lavender|cream|ivory)\b'
            ],
            'weight': [
                r'\b(\d+(?:\.\d+)?)\s*(kg|g|lb|lbs|oz|pounds?|grams?|kilograms?)\b'
            ],
            'material': [
                r'\b(cotton|polyester|silk|wool|leather|denim|linen|nylon|spandex|rayon|acrylic|metal|plastic|wood|glass|ceramic|steel|aluminum|rubber)\b'
            ],
            'brand': [
                r'(?:by|from|brand:?)\s+([A-Z][a-zA-Z0-9\s&]+?)(?:\s|$|,|\.|;)'
            ]
        }

        # Attributes below this confidence are considered unreliable.
        self.confidence_threshold = 0.6

    def _load_default_mappings(self) -> Dict:
        """
        Return the default product-type mappings.

        BUG FIX: this method was referenced from ``__init__`` but never
        defined, so constructing the extractor without explicit mappings
        raised ``AttributeError``. An empty mapping is a safe default.
        """
        return {}

    def extract_attributes(self, product_data: Dict) -> Dict:
        """
        Main extraction pipeline: NLP pass, optional OCR pass, then an LLM
        pass that always runs to validate/enrich, and a final merge.

        Args:
            product_data: dict with optional keys 'title', 'description',
                'short_description', and 'images' (list of file paths).

        Returns:
            Merged attribute dict with ``*_confidence`` scores.
        """
        # Phase 1: Quick NLP extraction
        nlp_attributes = self._extract_with_nlp(
            product_data.get('title', ''),
            product_data.get('description', '')
        )

        # Phase 2: OCR from images if provided
        ocr_text = ""
        if product_data.get('images'):
            ocr_text = self._extract_text_from_images(product_data['images'])
            if ocr_text:
                ocr_attributes = self._extract_with_nlp("", ocr_text)
                nlp_attributes = self._merge_attributes(nlp_attributes, ocr_attributes)

        # Phase 3: Always call LLM to enrich and validate NLP results
        llm_attributes = self._extract_with_llm(
            product_data,
            nlp_attributes,
            ocr_text
        )
        final_attributes = self._merge_attributes(nlp_attributes, llm_attributes)

        return final_attributes

    def _extract_with_nlp(self, title: str, description: str) -> Dict:
        """
        Fast extraction using regex patterns and spaCy NER.

        Confidence is 0.8 when all matches for an attribute agree, 0.5 when
        multiple distinct values were found (returned as a list).
        """
        text = f"{title} {description}".lower()
        attributes = defaultdict(list)

        # Pattern matching for structured attributes
        for attr_type, patterns in self.patterns.items():
            for pattern in patterns:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    # Prefer the first capture group when the pattern has one.
                    value = match.group(1) if match.groups() else match.group(0)
                    attributes[attr_type].append(value.strip())

        # Named Entity Recognition for brands, organizations. NER runs on the
        # original-cased text because casing is a strong NER signal.
        doc = self.nlp(title + " " + description)
        for ent in doc.ents:
            # Only take ORG as brand if the regex pass found none.
            if ent.label_ == "ORG" and 'brand' not in attributes:
                attributes['brand'].append(ent.text)
            elif ent.label_ == "PRODUCT":
                attributes['product_type'].append(ent.text)
            elif ent.label_ == "MONEY":
                attributes['price'].append(ent.text)

        # Deduplicate (preserving first-seen order — the original used
        # ``set`` which is unordered and kept duplicates in the multi-value
        # case) and attach confidences.
        cleaned_attributes = {}
        for key, values in attributes.items():
            if not values:
                continue
            unique = list(dict.fromkeys(values))
            if len(unique) == 1:
                cleaned_attributes[key] = unique[0]
                cleaned_attributes[f'{key}_confidence'] = 0.8
            else:
                cleaned_attributes[key] = unique
                cleaned_attributes[f'{key}_confidence'] = 0.5

        return cleaned_attributes

    def _extract_text_from_images(self, image_paths: List[str]) -> str:
        """
        Extract text from product images using OCR (best-effort; failures
        are logged and skipped).
        """
        extracted_text = []

        for img_path in image_paths[:3]:  # Limit to 3 images
            try:
                img = Image.open(img_path)
                text = pytesseract.image_to_string(img)
                if text.strip():
                    extracted_text.append(text.strip())
            except Exception as e:
                # Best-effort: a bad/missing image must not abort extraction.
                print(f"OCR error for {img_path}: {e}")

        return " ".join(extracted_text)

    def _needs_llm_extraction(self, attributes: Dict, product_data: Dict) -> bool:
        """
        Determine if LLM extraction is needed based on confidence and
        completeness.

        NOTE(review): currently unused — ``extract_attributes`` always calls
        the LLM. Kept for callers that want conditional LLM usage.
        """
        # Check if critical attributes are missing
        critical_attrs = ['category', 'brand', 'color', 'size']
        missing_critical = any(attr not in attributes for attr in critical_attrs)

        # Check confidence levels
        low_confidence = any(
            attributes.get(f'{key}_confidence', 0) < self.confidence_threshold
            for key in attributes.keys() if not key.endswith('_confidence')
        )

        # Check if description is complex/unstructured
        description = product_data.get('description', '')
        is_complex = len(description.split()) > 100 or 'features' in description.lower()

        return missing_critical or low_confidence or is_complex

    @staticmethod
    def _parse_json_response(text: str) -> Dict:
        """
        Parse JSON out of an LLM reply, tolerating markdown code fences and
        surrounding prose.

        BUG FIX: the original fed the raw reply straight to ``json.loads``,
        which fails whenever the model wraps the object in ```` ```json ````
        fences or adds commentary.
        """
        import json
        cleaned = text.strip()
        if cleaned.startswith("```"):
            # Strip ```json ... ``` / ``` ... ``` fences.
            cleaned = re.sub(r'^```(?:json)?\s*|\s*```$', '', cleaned)
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            # Fall back to the first {...} span in the reply.
            start, end = cleaned.find('{'), cleaned.rfind('}')
            if start != -1 and end > start:
                return json.loads(cleaned[start:end + 1])
            raise

    def _extract_with_llm(self, product_data: Dict, existing_attrs: Dict, ocr_text: str) -> Dict:
        """
        Use the LLM to extract comprehensive attributes and validate NLP
        results. Returns ``{}`` on any API/parsing failure (best-effort).
        """
        prompt = f"""Analyze this product and extract ALL possible attributes with high accuracy.

Title: {product_data.get('title', 'N/A')}
Description: {product_data.get('description', 'N/A')}
Short Description: {product_data.get('short_description', 'N/A')}
Text from images (OCR): {ocr_text if ocr_text else 'N/A'}

NLP Pre-extracted attributes (validate and enhance): {existing_attrs}

Extract a comprehensive JSON object with these fields (include all that apply):

**Basic Info:**
- category: specific product category/type
- subcategory: more specific classification
- brand: brand name
- model: model number/name
- product_line: product series/collection

**Physical Attributes:**
- color: all colors (list if multiple)
- size: size information (with units)
- dimensions: length/width/height with units
- weight: weight with units
- material: materials used (list all)
- finish: surface finish/texture

**Technical Specs (if applicable):**
- specifications: key technical specs as object
- compatibility: what it works with
- capacity: storage/volume capacity
- power: power requirements/battery info

**Commercial Info:**
- condition: new/used/refurbished
- warranty: warranty information
- country_of_origin: manufacturing country
- certifications: safety/quality certifications

**Descriptive:**
- key_features: list of 5-8 main features
- benefits: main benefits/use cases
- target_audience: who this is for
- usage_instructions: how to use (if mentioned)
- care_instructions: care/maintenance info
- style: style/aesthetic (modern, vintage, etc)
- season: seasonal relevance (if applicable)
- occasion: suitable occasions (if applicable)

**Additional:**
- package_contents: what's included
- variants: available variants/options
- tags: relevant search tags (list)

Only include fields where you have high confidence. Use null for uncertain values.
For lists, provide all relevant items. Be thorough and extract every possible detail."""

        content = [{"type": "text", "text": prompt}]

        # Add images if available
        if product_data.get('images'):
            for img_path in product_data['images'][:3]:  # Include up to 3 images for better context
                try:
                    with open(img_path, 'rb') as f:
                        img_data = base64.b64encode(f.read()).decode()

                    # Determine media type from the file extension
                    media_type = "image/jpeg"
                    if img_path.lower().endswith('.png'):
                        media_type = "image/png"
                    elif img_path.lower().endswith('.webp'):
                        media_type = "image/webp"

                    content.append({
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": img_data
                        }
                    })
                except Exception as e:
                    print(f"Error processing image {img_path}: {e}")

        try:
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=2048,  # Increased for comprehensive extraction
                messages=[{"role": "user", "content": content}]
            )

            llm_result = self._parse_json_response(response.content[0].text)

            # Add high confidence to LLM results.
            # BUG FIX: iterate over a snapshot of the keys — the original
            # added ``{key}_confidence`` entries while iterating the dict
            # itself, which raises "dictionary changed size during iteration".
            for key in list(llm_result.keys()):
                if llm_result[key] is not None:
                    llm_result[f'{key}_confidence'] = 0.95

            return llm_result

        except Exception as e:
            print(f"LLM extraction error: {e}")
            return {}

    def _identify_missing_attributes(self, existing_attrs: Dict) -> List[str]:
        """
        Identify which important attributes are absent or below 0.7
        confidence.
        """
        important_attrs = ['category', 'brand', 'color', 'size', 'material', 'key_features']
        missing = []

        for attr in important_attrs:
            if attr not in existing_attrs or existing_attrs.get(f'{attr}_confidence', 0) < 0.7:
                missing.append(attr)

        return missing

    def _merge_attributes(self, base: Dict, additional: Dict) -> Dict:
        """
        Merge ``additional`` (typically LLM output) over ``base`` (NLP
        output).

        Rules: LLM nulls keep the NLP value; new LLM keys are added; when
        both sides have a value, the LLM wins — agreement boosts confidence,
        disagreement keeps the NLP value in ``{key}_nlp_value`` and sets
        ``{key}_conflict`` for human review.
        """
        merged = {}

        # Start with all NLP attributes
        for key, value in base.items():
            if not key.endswith('_confidence'):
                merged[key] = value
                merged[f'{key}_confidence'] = base.get(f'{key}_confidence', 0.7)

        # Add or override with LLM attributes
        for key, value in additional.items():
            if key.endswith('_confidence'):
                continue

            if value is None:
                # Keep NLP value if LLM returns null
                continue

            if key not in merged:
                # LLM found a new attribute
                merged[key] = value
                merged[f'{key}_confidence'] = additional.get(f'{key}_confidence', 0.95)
            else:
                llm_conf = additional.get(f'{key}_confidence', 0.95)
                nlp_conf = merged.get(f'{key}_confidence', 0.7)

                if str(value).lower() != str(merged[key]).lower():
                    # Values differ - use LLM but add conflict flag
                    merged[key] = value
                    merged[f'{key}_confidence'] = llm_conf
                    merged[f'{key}_nlp_value'] = base.get(key)  # Store NLP value for reference
                    merged[f'{key}_conflict'] = True
                else:
                    # Values match - boost confidence
                    merged[key] = value
                    merged[f'{key}_confidence'] = min(0.99, (llm_conf + nlp_conf) / 2 + 0.1)

        return merged
|
|
|
+
|
|
|
+
|
|
|
# Example usage
if __name__ == "__main__":
    demo_extractor = HybridAttributeExtractor(anthropic_api_key="your-api-key")

    sample_product = {
        'title': 'Nike Air Max 270 Running Shoes - Black/White',
        'description': 'Premium running shoes with Max Air cushioning. Breathable mesh upper, rubber outsole. Perfect for daily training.',
        'images': ['path/to/image1.jpg', 'path/to/image2.jpg'],
    }

    # Run the hybrid pipeline and dump the merged attributes.
    extracted = demo_extractor.extract_attributes(sample_product)
    print(extracted)
|