@@ -1,322 +0,0 @@
-# services/attribute_extractor.py
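-
-"""
-Hybrid product attribute extraction.
-
-Pipeline (see HybridAttributeExtractor.extract_attributes): a regex/spaCy pass
-over the title and description, an optional OCR pass over product images, an
-LLM enrichment pass via the Anthropic API, and a merge step that records
-per-attribute confidence scores and conflicts.
-"""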
-
-import re
-import spacy
-from typing import Dict, List, Optional
-from anthropic import Anthropic
-import base64
-from PIL import Image
-import pytesseract
-from collections import defaultdict
-
-
-class HybridAttributeExtractor:
-    """
-    Hybrid extractor: regex/spaCy NLP for structured fields, an LLM for complex
-    or ambiguous cases. Every extracted attribute is stored alongside a
-    '<attribute>_confidence' score (0.8/0.5 for NLP matches, 0.95 for LLM values).
-    """
-
-    def __init__(self, anthropic_api_key: str, product_type_mappings: Optional[Dict] = None):
-        self.nlp = spacy.load("en_core_web_sm")
-        self.client = Anthropic(api_key=anthropic_api_key)
-        self.product_type_mappings = product_type_mappings or self._load_default_mappings()
-
-        # Regex patterns for common structured attributes
-        self.patterns = {
-            'size': [
-                r'\b(XXS|XS|S|M|L|XL|XXL|XXXL)\b',
-                r'\b(\d+(?:\.\d+)?)\s*(inch|inches|cm|mm|meter|metres?|ft|feet|")\b',
-                r'\b(small|medium|large|extra large)\b'
-            ],
-            'color': [
-                r'\b(black|white|red|blue|green|yellow|orange|purple|pink|brown|gray|grey|silver|gold|beige|navy|maroon|olive|teal|turquoise|lavender|cream|ivory)\b'
-            ],
-            'weight': [
-                r'\b(\d+(?:\.\d+)?)\s*(kg|g|lb|lbs|oz|pounds?|grams?|kilograms?)\b'
-            ],
-            'material': [
-                r'\b(cotton|polyester|silk|wool|leather|denim|linen|nylon|spandex|rayon|acrylic|metal|plastic|wood|glass|ceramic|steel|aluminum|rubber)\b'
-            ],
-            'brand': [
-                r'(?:by|from|brand:?)\s+([A-Z][a-zA-Z0-9\s&]+?)(?:\s|$|,|\.|;)'
-            ]
-        }
-
-        # Minimum confidence used when deciding whether LLM extraction is needed
-        self.confidence_threshold = 0.6
-
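-    def _load_default_mappings(self) -> Dict:
-        # Minimal placeholder (assumed): __init__ references this helper but the
-        # original module never defined it, so the no-argument constructor would
-        # raise AttributeError. Returning an empty mapping keeps that path working.
-        return {}
-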
-    def extract_attributes(self, product_data: Dict) -> Dict:
-        """
-        Main extraction method: NLP first, then OCR on images, then an LLM pass
-        to enrich and validate the results.
-        """
-        # Phase 1: Quick NLP extraction from title and description
-        nlp_attributes = self._extract_with_nlp(
-            product_data.get('title', ''),
-            product_data.get('description', '')
-        )
-
-        # Phase 2: OCR from images, if provided
-        ocr_text = ""
-        if product_data.get('images'):
-            ocr_text = self._extract_text_from_images(product_data['images'])
-            if ocr_text:
-                ocr_attributes = self._extract_with_nlp("", ocr_text)
-                nlp_attributes = self._merge_attributes(nlp_attributes, ocr_attributes)
-
-        # Phase 3: Always call the LLM to enrich and validate the NLP results
-        llm_attributes = self._extract_with_llm(
-            product_data,
-            nlp_attributes,
-            ocr_text
-        )
-        final_attributes = self._merge_attributes(nlp_attributes, llm_attributes)
-
-        return final_attributes
-
-    def _extract_with_nlp(self, title: str, description: str) -> Dict:
-        """
-        Fast extraction using regex patterns and spaCy NER.
-        """
-        # Keep the original casing: the brand pattern relies on capitalization.
-        text = f"{title} {description}"
-        attributes = defaultdict(list)
-
-        # Pattern matching for structured attributes
-        for attr_type, patterns in self.patterns.items():
-            # Brand names are identified by their capitalization, so only the
-            # other patterns are matched case-insensitively.
-            flags = 0 if attr_type == 'brand' else re.IGNORECASE
-            for pattern in patterns:
-                matches = re.finditer(pattern, text, flags)
-                for match in matches:
-                    value = match.group(1) if match.groups() else match.group(0)
-                    attributes[attr_type].append(value.strip())
-
-        # Named Entity Recognition for brands, product types and prices
-        doc = self.nlp(text)
-        for ent in doc.ents:
-            if ent.label_ == "ORG" and 'brand' not in attributes:
-                attributes['brand'].append(ent.text)
-            elif ent.label_ == "PRODUCT":
-                attributes['product_type'].append(ent.text)
-            elif ent.label_ == "MONEY":
-                attributes['price'].append(ent.text)
-
-        # Deduplicate and clean
-        cleaned_attributes = {}
-        for key, values in attributes.items():
-            if values:
-                # Collapse to a single value when all matches agree; otherwise keep the list
-                cleaned_attributes[key] = list(set(values))[0] if len(set(values)) == 1 else values
-                cleaned_attributes[f'{key}_confidence'] = 0.8 if len(set(values)) == 1 else 0.5
-
-        return cleaned_attributes
-
-    def _extract_text_from_images(self, image_paths: List[str]) -> str:
-        """
-        Extract text from product images using OCR.
-        """
-        extracted_text = []
-
-        for img_path in image_paths[:3]:  # Limit to 3 images
-            try:
-                img = Image.open(img_path)
-                text = pytesseract.image_to_string(img)
-                if text.strip():
-                    extracted_text.append(text.strip())
-            except Exception as e:
-                print(f"OCR error for {img_path}: {e}")
-
-        return " ".join(extracted_text)
-
-    def _needs_llm_extraction(self, attributes: Dict, product_data: Dict) -> bool:
-        """
-        Determine whether LLM extraction is needed, based on confidence and
-        completeness. (Currently unused: extract_attributes always calls the LLM.)
-        """
-        # Check if critical attributes are missing
-        critical_attrs = ['category', 'brand', 'color', 'size']
-        missing_critical = any(attr not in attributes for attr in critical_attrs)
-
-        # Check confidence levels
-        low_confidence = any(
-            attributes.get(f'{key}_confidence', 0) < self.confidence_threshold
-            for key in attributes.keys() if not key.endswith('_confidence')
-        )
-
-        # Check if the description is complex/unstructured
-        description = product_data.get('description', '')
-        is_complex = len(description.split()) > 100 or 'features' in description.lower()
-
-        return missing_critical or low_confidence or is_complex
-
-    def _extract_with_llm(self, product_data: Dict, existing_attrs: Dict, ocr_text: str) -> Dict:
-        """
-        Use the LLM to extract comprehensive attributes and validate the NLP results.
-        """
-        prompt = f"""Analyze this product and extract ALL possible attributes with high accuracy.
-
-Title: {product_data.get('title', 'N/A')}
-Description: {product_data.get('description', 'N/A')}
-Short Description: {product_data.get('short_description', 'N/A')}
-Text from images (OCR): {ocr_text if ocr_text else 'N/A'}
-
-NLP pre-extracted attributes (validate and enhance): {existing_attrs}
-
-Extract a comprehensive JSON object with these fields (include all that apply):
-
-**Basic Info:**
-- category: specific product category/type
-- subcategory: more specific classification
-- brand: brand name
-- model: model number/name
-- product_line: product series/collection
-
-**Physical Attributes:**
-- color: all colors (list if multiple)
-- size: size information (with units)
-- dimensions: length/width/height with units
-- weight: weight with units
-- material: materials used (list all)
-- finish: surface finish/texture
-
-**Technical Specs (if applicable):**
-- specifications: key technical specs as an object
-- compatibility: what it works with
-- capacity: storage/volume capacity
-- power: power requirements/battery info
-
-**Commercial Info:**
-- condition: new/used/refurbished
-- warranty: warranty information
-- country_of_origin: manufacturing country
-- certifications: safety/quality certifications
-
-**Descriptive:**
-- key_features: list of 5-8 main features
-- benefits: main benefits/use cases
-- target_audience: who this is for
-- usage_instructions: how to use (if mentioned)
-- care_instructions: care/maintenance info
-- style: style/aesthetic (modern, vintage, etc.)
-- season: seasonal relevance (if applicable)
-- occasion: suitable occasions (if applicable)
-
-**Additional:**
-- package_contents: what's included
-- variants: available variants/options
-- tags: relevant search tags (list)
-
-Only include fields where you have high confidence. Use null for uncertain values.
-For lists, provide all relevant items. Be thorough and extract every possible detail.
-Respond with the JSON object only - no code fences and no commentary."""
-
-        content = [{"type": "text", "text": prompt}]
-
-        # Add images if available
-        if product_data.get('images'):
-            for img_path in product_data['images'][:3]:  # Include up to 3 images for better context
-                try:
-                    with open(img_path, 'rb') as f:
-                        img_data = base64.b64encode(f.read()).decode()
-
-                    # Determine media type from the file extension
-                    media_type = "image/jpeg"
-                    if img_path.lower().endswith('.png'):
-                        media_type = "image/png"
-                    elif img_path.lower().endswith('.webp'):
-                        media_type = "image/webp"
-
-                    content.append({
-                        "type": "image",
-                        "source": {
-                            "type": "base64",
-                            "media_type": media_type,
-                            "data": img_data
-                        }
-                    })
-                except Exception as e:
-                    print(f"Error processing image {img_path}: {e}")
-
-        try:
-            response = self.client.messages.create(
-                model="claude-sonnet-4-20250514",
-                max_tokens=2048,  # Increased for comprehensive extraction
-                messages=[{"role": "user", "content": content}]
-            )
-
-            # Parse the JSON response
-            import json
-            llm_result = json.loads(response.content[0].text)
-
-            # Attach a high confidence score to every non-null LLM value.
-            # Iterate over a snapshot of the keys: adding the *_confidence keys
-            # while iterating the dict directly would raise RuntimeError.
-            for key in list(llm_result):
-                if llm_result[key] is not None:
-                    llm_result[f'{key}_confidence'] = 0.95
-
-            return llm_result
-
-        except Exception as e:
-            print(f"LLM extraction error: {e}")
-            return {}
-
-    def _identify_missing_attributes(self, existing_attrs: Dict) -> List[str]:
-        """
-        Identify which important attributes are missing or low-confidence.
-        (Currently unused in this module.)
-        """
-        important_attrs = ['category', 'brand', 'color', 'size', 'material', 'key_features']
-        missing = []
-
-        for attr in important_attrs:
-            if attr not in existing_attrs or existing_attrs.get(f'{attr}_confidence', 0) < 0.7:
-                missing.append(attr)
-
-        return missing
-
-    def _merge_attributes(self, base: Dict, additional: Dict) -> Dict:
-        """
-        Merge attribute dicts, preferring LLM values for new attributes and for
-        conflicts, while recording the NLP value and a conflict flag for review.
-        """
-        merged = {}
-
-        # Start with all NLP attributes
-        for key, value in base.items():
-            if not key.endswith('_confidence'):
-                merged[key] = value
-                merged[f'{key}_confidence'] = base.get(f'{key}_confidence', 0.7)
-
-        # Add or override with LLM attributes
-        for key, value in additional.items():
-            if key.endswith('_confidence'):
-                continue
-
-            if value is None:
-                # Keep the NLP value if the LLM returns null
-                continue
-
-            if key not in merged:
-                # LLM found a new attribute
-                merged[key] = value
-                merged[f'{key}_confidence'] = additional.get(f'{key}_confidence', 0.95)
-            else:
-                # Both sources have a value - compare them
-                llm_conf = additional.get(f'{key}_confidence', 0.95)
-                nlp_conf = merged.get(f'{key}_confidence', 0.7)
-
-                if str(value).lower() != str(merged[key]).lower():
-                    # Values differ - use the LLM value but flag the conflict
-                    merged[key] = value
-                    merged[f'{key}_confidence'] = llm_conf
-                    merged[f'{key}_nlp_value'] = base.get(key)  # Store NLP value for reference
-                    merged[f'{key}_conflict'] = True
-                else:
-                    # Values match - boost confidence
-                    merged[key] = value
-                    merged[f'{key}_confidence'] = min(0.99, (llm_conf + nlp_conf) / 2 + 0.1)
-
-        return merged
-
-
-# Example usage
-if __name__ == "__main__":
-    extractor = HybridAttributeExtractor(anthropic_api_key="your-api-key")
-
-    product = {
-        'title': 'Nike Air Max 270 Running Shoes - Black/White',
-        'description': 'Premium running shoes with Max Air cushioning. Breathable mesh upper, rubber outsole. Perfect for daily training.',
-        'images': ['path/to/image1.jpg', 'path/to/image2.jpg']
-    }
-
-    attributes = extractor.extract_attributes(product)
-    print(attributes)