# services/attribute_extractor.py
import base64
import json
import re
from collections import defaultdict
from typing import Dict, List, Optional

import pytesseract
import spacy
from anthropic import Anthropic
from PIL import Image
class HybridAttributeExtractor:
    """
    Hybrid product-attribute extractor.

    Pipeline:
      1. Fast regex + spaCy NER over the product title/description.
      2. OCR text from up to three product images, fed through the same
         NLP pass and merged in.
      3. An Anthropic LLM call that validates and enriches the NLP
         results. NLP and LLM outputs are merged with per-attribute
         confidence scores; disagreements are flagged for review.
    """

    def __init__(self, anthropic_api_key: str, product_type_mappings: Optional[Dict] = None):
        """
        Args:
            anthropic_api_key: key used to construct the Anthropic client.
            product_type_mappings: optional category mapping table; falls
                back to `_load_default_mappings()` when omitted or falsy.
        """
        self.nlp = spacy.load("en_core_web_sm")
        self.client = Anthropic(api_key=anthropic_api_key)
        self.product_type_mappings = product_type_mappings or self._load_default_mappings()

        # Regex patterns for cheap extraction of structured attributes.
        self.patterns = {
            'size': [
                r'\b(XXS|XS|S|M|L|XL|XXL|XXXL)\b',
                r'\b(\d+(?:\.\d+)?)\s*(inch|inches|cm|mm|meter|metres?|ft|feet|")\b',
                r'\b(small|medium|large|extra large)\b'
            ],
            'color': [
                r'\b(black|white|red|blue|green|yellow|orange|purple|pink|brown|gray|grey|silver|gold|beige|navy|maroon|olive|teal|turquoise|lavender|cream|ivory)\b'
            ],
            'weight': [
                r'\b(\d+(?:\.\d+)?)\s*(kg|g|lb|lbs|oz|pounds?|grams?|kilograms?)\b'
            ],
            'material': [
                r'\b(cotton|polyester|silk|wool|leather|denim|linen|nylon|spandex|rayon|acrylic|metal|plastic|wood|glass|ceramic|steel|aluminum|rubber)\b'
            ],
            'brand': [
                r'(?:by|from|brand:?)\s+([A-Z][a-zA-Z0-9\s&]+?)(?:\s|$|,|\.|;)'
            ]
        }

        # Minimum per-attribute confidence before a value counts as reliable.
        self.confidence_threshold = 0.6

    def _load_default_mappings(self) -> Dict:
        """
        Fallback product-type mappings used when none are supplied.

        BUGFIX: __init__ referenced this method but it was never defined,
        so constructing the extractor without explicit mappings raised
        AttributeError. An empty dict is the safe neutral default; callers
        that need real mappings pass them explicitly.
        """
        return {}

    def extract_attributes(self, product_data: Dict) -> Dict:
        """
        Run the full extraction pipeline for one product.

        Args:
            product_data: dict with optional keys 'title', 'description',
                'short_description', and 'images' (list of file paths).

        Returns:
            Merged attribute dict including `<attr>_confidence` entries and
            `<attr>_conflict` / `<attr>_nlp_value` flags where NLP and LLM
            disagreed.
        """
        # Phase 1: quick regex/NER extraction from the text fields.
        nlp_attributes = self._extract_with_nlp(
            product_data.get('title', ''),
            product_data.get('description', '')
        )

        # Phase 2: OCR from images, run through the same NLP pass.
        ocr_text = ""
        if product_data.get('images'):
            ocr_text = self._extract_text_from_images(product_data['images'])
            if ocr_text:
                ocr_attributes = self._extract_with_nlp("", ocr_text)
                nlp_attributes = self._merge_attributes(nlp_attributes, ocr_attributes)

        # Phase 3: always call the LLM to enrich and validate NLP results.
        # (_needs_llm_extraction is available for callers that want to gate
        # this call, but the current pipeline enriches unconditionally.)
        llm_attributes = self._extract_with_llm(
            product_data,
            nlp_attributes,
            ocr_text
        )
        final_attributes = self._merge_attributes(nlp_attributes, llm_attributes)

        return final_attributes

    def _extract_with_nlp(self, title: str, description: str) -> Dict:
        """
        Fast extraction using regex patterns and spaCy NER.

        Returns a dict of attribute -> value (or list of values when
        multiple distinct matches were found), plus `<attr>_confidence`
        entries: 0.8 for a single unanimous value, 0.5 otherwise.
        """
        # Regex matching is done on lowercased text; NER gets original case.
        text = f"{title} {description}".lower()
        attributes = defaultdict(list)

        # Pattern matching for structured attributes.
        for attr_type, patterns in self.patterns.items():
            for pattern in patterns:
                matches = re.finditer(pattern, text, re.IGNORECASE)
                for match in matches:
                    value = match.group(1) if match.groups() else match.group(0)
                    attributes[attr_type].append(value.strip())

        # Named Entity Recognition for brands, products, prices.
        doc = self.nlp(title + " " + description)
        for ent in doc.ents:
            # Only take ORG as brand if the regex pass found none.
            if ent.label_ == "ORG" and 'brand' not in attributes:
                attributes['brand'].append(ent.text)
            elif ent.label_ == "PRODUCT":
                attributes['product_type'].append(ent.text)
            elif ent.label_ == "MONEY":
                attributes['price'].append(ent.text)

        # Deduplicate and attach confidence scores.
        cleaned_attributes = {}
        for key, values in attributes.items():
            if values:
                # Collapse to a scalar when all matches agree; otherwise
                # keep the raw list for downstream disambiguation.
                cleaned_attributes[key] = list(set(values))[0] if len(set(values)) == 1 else values
                cleaned_attributes[f'{key}_confidence'] = 0.8 if len(set(values)) == 1 else 0.5

        return cleaned_attributes

    def _extract_text_from_images(self, image_paths: List[str]) -> str:
        """
        Extract text from product images using Tesseract OCR.

        Best-effort: unreadable images are logged and skipped. At most the
        first three images are processed to bound latency.
        """
        extracted_text = []

        for img_path in image_paths[:3]:  # Limit to 3 images
            try:
                img = Image.open(img_path)
                text = pytesseract.image_to_string(img)
                if text.strip():
                    extracted_text.append(text.strip())
            except Exception as e:
                print(f"OCR error for {img_path}: {e}")

        return " ".join(extracted_text)

    def _needs_llm_extraction(self, attributes: Dict, product_data: Dict) -> bool:
        """
        Heuristic: is an LLM pass worthwhile for this product?

        True when any critical attribute is missing, any extracted value is
        below the confidence threshold, or the description looks complex.
        """
        # Check if critical attributes are missing.
        critical_attrs = ['category', 'brand', 'color', 'size']
        missing_critical = any(attr not in attributes for attr in critical_attrs)

        # Check confidence levels of what NLP did extract.
        low_confidence = any(
            attributes.get(f'{key}_confidence', 0) < self.confidence_threshold
            for key in attributes.keys() if not key.endswith('_confidence')
        )

        # Long or feature-laden descriptions tend to hide attributes the
        # regexes miss.
        description = product_data.get('description', '')
        is_complex = len(description.split()) > 100 or 'features' in description.lower()

        return missing_critical or low_confidence or is_complex

    @staticmethod
    def _parse_llm_json(raw: str) -> Dict:
        """
        Parse the model's reply as JSON, tolerating markdown fences.

        BUGFIX: the previous code fed `response.content[0].text` straight
        into json.loads, which fails whenever the model wraps its answer in
        ```json fences or surrounding prose. Strip fences, then fall back to
        the outermost {...} span before parsing.
        """
        text = raw.strip()
        if text.startswith("```"):
            text = re.sub(r'^```[a-zA-Z]*\s*|\s*```$', '', text)
        start = text.find('{')
        end = text.rfind('}')
        if start != -1 and end > start:
            text = text[start:end + 1]
        return json.loads(text)

    def _extract_with_llm(self, product_data: Dict, existing_attrs: Dict, ocr_text: str) -> Dict:
        """
        Use the LLM to extract comprehensive attributes and validate NLP results.

        Sends title/description/OCR text plus up to three images, expects a
        JSON object back, and stamps every non-null field with 0.95
        confidence. Returns {} on any API or parse failure (best-effort).
        """
        prompt = f"""Analyze this product and extract ALL possible attributes with high accuracy.
Title: {product_data.get('title', 'N/A')}
Description: {product_data.get('description', 'N/A')}
Short Description: {product_data.get('short_description', 'N/A')}
Text from images (OCR): {ocr_text if ocr_text else 'N/A'}
NLP Pre-extracted attributes (validate and enhance): {existing_attrs}
Extract a comprehensive JSON object with these fields (include all that apply):
**Basic Info:**
- category: specific product category/type
- subcategory: more specific classification
- brand: brand name
- model: model number/name
- product_line: product series/collection
**Physical Attributes:**
- color: all colors (list if multiple)
- size: size information (with units)
- dimensions: length/width/height with units
- weight: weight with units
- material: materials used (list all)
- finish: surface finish/texture
**Technical Specs (if applicable):**
- specifications: key technical specs as object
- compatibility: what it works with
- capacity: storage/volume capacity
- power: power requirements/battery info
**Commercial Info:**
- condition: new/used/refurbished
- warranty: warranty information
- country_of_origin: manufacturing country
- certifications: safety/quality certifications
**Descriptive:**
- key_features: list of 5-8 main features
- benefits: main benefits/use cases
- target_audience: who this is for
- usage_instructions: how to use (if mentioned)
- care_instructions: care/maintenance info
- style: style/aesthetic (modern, vintage, etc)
- season: seasonal relevance (if applicable)
- occasion: suitable occasions (if applicable)
**Additional:**
- package_contents: what's included
- variants: available variants/options
- tags: relevant search tags (list)
Only include fields where you have high confidence. Use null for uncertain values.
For lists, provide all relevant items. Be thorough and extract every possible detail."""
        content = [{"type": "text", "text": prompt}]

        # Attach up to three images as base64 blocks for multimodal context.
        if product_data.get('images'):
            for img_path in product_data['images'][:3]:
                try:
                    with open(img_path, 'rb') as f:
                        img_data = base64.b64encode(f.read()).decode()

                    # Determine media type from the file extension;
                    # default to JPEG.
                    media_type = "image/jpeg"
                    if img_path.lower().endswith('.png'):
                        media_type = "image/png"
                    elif img_path.lower().endswith('.webp'):
                        media_type = "image/webp"

                    content.append({
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": img_data
                        }
                    })
                except Exception as e:
                    print(f"Error processing image {img_path}: {e}")

        try:
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=2048,  # Increased for comprehensive extraction
                messages=[{"role": "user", "content": content}]
            )

            # Parse JSON response (fence-tolerant).
            llm_result = self._parse_llm_json(response.content[0].text)

            # Stamp high confidence on every non-null LLM field.
            for key in llm_result:
                if llm_result[key] is not None:
                    llm_result[f'{key}_confidence'] = 0.95

            return llm_result

        except Exception as e:
            print(f"LLM extraction error: {e}")
            return {}

    def _identify_missing_attributes(self, existing_attrs: Dict) -> List[str]:
        """
        List important attributes that are absent or below 0.7 confidence.
        """
        important_attrs = ['category', 'brand', 'color', 'size', 'material', 'key_features']
        missing = []

        for attr in important_attrs:
            if attr not in existing_attrs or existing_attrs.get(f'{attr}_confidence', 0) < 0.7:
                missing.append(attr)

        return missing

    def _merge_attributes(self, base: Dict, additional: Dict) -> Dict:
        """
        Merge two attribute dicts, preferring `additional` (the LLM pass).

        Rules: null in `additional` keeps the base value; a differing value
        overrides but records `<key>_nlp_value` and `<key>_conflict`; a
        matching value boosts confidence (capped at 0.99).
        """
        merged = {}

        # Start with all base (NLP) attributes and their confidences.
        for key, value in base.items():
            if not key.endswith('_confidence'):
                merged[key] = value
                merged[f'{key}_confidence'] = base.get(f'{key}_confidence', 0.7)

        # Add or override with the additional (LLM) attributes.
        for key, value in additional.items():
            if key.endswith('_confidence'):
                continue

            if value is None:
                # Keep the base value if the LLM returned null.
                continue

            if key not in merged:
                # LLM found a brand-new attribute.
                merged[key] = value
                merged[f'{key}_confidence'] = additional.get(f'{key}_confidence', 0.95)
            else:
                llm_conf = additional.get(f'{key}_confidence', 0.95)
                nlp_conf = merged.get(f'{key}_confidence', 0.7)

                if str(value).lower() != str(merged[key]).lower():
                    # Values differ — take the LLM value but flag the
                    # conflict and keep the NLP value for review.
                    merged[key] = value
                    merged[f'{key}_confidence'] = llm_conf
                    merged[f'{key}_nlp_value'] = base.get(key)
                    merged[f'{key}_conflict'] = True
                else:
                    # Values agree — boost confidence.
                    merged[key] = value
                    merged[f'{key}_confidence'] = min(0.99, (llm_conf + nlp_conf) / 2 + 0.1)

        return merged
# Example usage
if __name__ == "__main__":
    # Demo run: construct the extractor and pull attributes from a sample
    # product record (replace the API key and image paths with real ones).
    extractor = HybridAttributeExtractor(anthropic_api_key="your-api-key")

    sample_product = {
        'title': 'Nike Air Max 270 Running Shoes - Black/White',
        'description': (
            'Premium running shoes with Max Air cushioning. '
            'Breathable mesh upper, rubber outsole. Perfect for daily training.'
        ),
        'images': ['path/to/image1.jpg', 'path/to/image2.jpg'],
    }

    extracted = extractor.extract_attributes(sample_product)
    print(extracted)