# services/attribute_extractor.py
import base64
import json
import re
from collections import defaultdict
from typing import Dict, List, Optional

import pytesseract
import spacy
from anthropic import Anthropic
from PIL import Image
  10. class HybridAttributeExtractor:
  11. """
  12. Hybrid extractor using NLP for structured data and LLM for complex/ambiguous cases
  13. """
  14. def __init__(self, anthropic_api_key: str, product_type_mappings: Dict = None):
  15. self.nlp = spacy.load("en_core_web_sm")
  16. self.client = Anthropic(api_key=anthropic_api_key)
  17. self.product_type_mappings = product_type_mappings or self._load_default_mappings()
  18. # Define patterns for common attributes
  19. self.patterns = {
  20. 'size': [
  21. r'\b(XXS|XS|S|M|L|XL|XXL|XXXL)\b',
  22. r'\b(\d+(?:\.\d+)?)\s*(inch|inches|cm|mm|meter|metres?|ft|feet|")\b',
  23. r'\b(small|medium|large|extra large)\b'
  24. ],
  25. 'color': [
  26. r'\b(black|white|red|blue|green|yellow|orange|purple|pink|brown|gray|grey|silver|gold|beige|navy|maroon|olive|teal|turquoise|lavender|cream|ivory)\b'
  27. ],
  28. 'weight': [
  29. r'\b(\d+(?:\.\d+)?)\s*(kg|g|lb|lbs|oz|pounds?|grams?|kilograms?)\b'
  30. ],
  31. 'material': [
  32. r'\b(cotton|polyester|silk|wool|leather|denim|linen|nylon|spandex|rayon|acrylic|metal|plastic|wood|glass|ceramic|steel|aluminum|rubber)\b'
  33. ],
  34. 'brand': [
  35. r'(?:by|from|brand:?)\s+([A-Z][a-zA-Z0-9\s&]+?)(?:\s|$|,|\.|;)'
  36. ]
  37. }
  38. # Confidence thresholds
  39. self.confidence_threshold = 0.6
  40. def extract_attributes(self, product_data: Dict) -> Dict:
  41. """
  42. Main extraction method - uses NLP first, LLM for gaps
  43. """
  44. # Phase 1: Quick NLP extraction
  45. nlp_attributes = self._extract_with_nlp(
  46. product_data.get('title', ''),
  47. product_data.get('description', '')
  48. )
  49. # Phase 2: OCR from images if provided
  50. ocr_text = ""
  51. if product_data.get('images'):
  52. ocr_text = self._extract_text_from_images(product_data['images'])
  53. if ocr_text:
  54. ocr_attributes = self._extract_with_nlp("", ocr_text)
  55. nlp_attributes = self._merge_attributes(nlp_attributes, ocr_attributes)
  56. # Phase 3: Always call LLM to enrich and validate NLP results
  57. llm_attributes = self._extract_with_llm(
  58. product_data,
  59. nlp_attributes,
  60. ocr_text
  61. )
  62. final_attributes = self._merge_attributes(nlp_attributes, llm_attributes)
  63. return final_attributes
  64. def _extract_with_nlp(self, title: str, description: str) -> Dict:
  65. """
  66. Fast extraction using regex and spaCy
  67. """
  68. text = f"{title} {description}".lower()
  69. attributes = defaultdict(list)
  70. # Pattern matching for structured attributes
  71. for attr_type, patterns in self.patterns.items():
  72. for pattern in patterns:
  73. matches = re.finditer(pattern, text, re.IGNORECASE)
  74. for match in matches:
  75. value = match.group(1) if match.groups() else match.group(0)
  76. attributes[attr_type].append(value.strip())
  77. # Named Entity Recognition for brands, organizations
  78. doc = self.nlp(title + " " + description)
  79. for ent in doc.ents:
  80. if ent.label_ == "ORG" and 'brand' not in attributes:
  81. attributes['brand'].append(ent.text)
  82. elif ent.label_ == "PRODUCT":
  83. attributes['product_type'].append(ent.text)
  84. elif ent.label_ == "MONEY":
  85. attributes['price'].append(ent.text)
  86. # Deduplicate and clean
  87. cleaned_attributes = {}
  88. for key, values in attributes.items():
  89. if values:
  90. # Take most common or first occurrence
  91. cleaned_attributes[key] = list(set(values))[0] if len(set(values)) == 1 else values
  92. cleaned_attributes[f'{key}_confidence'] = 0.8 if len(set(values)) == 1 else 0.5
  93. return cleaned_attributes
  94. def _extract_text_from_images(self, image_paths: List[str]) -> str:
  95. """
  96. Extract text from product images using OCR
  97. """
  98. extracted_text = []
  99. for img_path in image_paths[:3]: # Limit to 3 images
  100. try:
  101. img = Image.open(img_path)
  102. text = pytesseract.image_to_string(img)
  103. if text.strip():
  104. extracted_text.append(text.strip())
  105. except Exception as e:
  106. print(f"OCR error for {img_path}: {e}")
  107. return " ".join(extracted_text)
  108. def _needs_llm_extraction(self, attributes: Dict, product_data: Dict) -> bool:
  109. """
  110. Determine if LLM extraction is needed based on confidence and completeness
  111. """
  112. # Check if critical attributes are missing
  113. critical_attrs = ['category', 'brand', 'color', 'size']
  114. missing_critical = any(attr not in attributes for attr in critical_attrs)
  115. # Check confidence levels
  116. low_confidence = any(
  117. attributes.get(f'{key}_confidence', 0) < self.confidence_threshold
  118. for key in attributes.keys() if not key.endswith('_confidence')
  119. )
  120. # Check if description is complex/unstructured
  121. description = product_data.get('description', '')
  122. is_complex = len(description.split()) > 100 or 'features' in description.lower()
  123. return missing_critical or low_confidence or is_complex
  124. def _extract_with_llm(self, product_data: Dict, existing_attrs: Dict, ocr_text: str) -> Dict:
  125. """
  126. Use LLM to extract comprehensive attributes and validate NLP results
  127. """
  128. prompt = f"""Analyze this product and extract ALL possible attributes with high accuracy.
  129. Title: {product_data.get('title', 'N/A')}
  130. Description: {product_data.get('description', 'N/A')}
  131. Short Description: {product_data.get('short_description', 'N/A')}
  132. Text from images (OCR): {ocr_text if ocr_text else 'N/A'}
  133. NLP Pre-extracted attributes (validate and enhance): {existing_attrs}
  134. Extract a comprehensive JSON object with these fields (include all that apply):
  135. **Basic Info:**
  136. - category: specific product category/type
  137. - subcategory: more specific classification
  138. - brand: brand name
  139. - model: model number/name
  140. - product_line: product series/collection
  141. **Physical Attributes:**
  142. - color: all colors (list if multiple)
  143. - size: size information (with units)
  144. - dimensions: length/width/height with units
  145. - weight: weight with units
  146. - material: materials used (list all)
  147. - finish: surface finish/texture
  148. **Technical Specs (if applicable):**
  149. - specifications: key technical specs as object
  150. - compatibility: what it works with
  151. - capacity: storage/volume capacity
  152. - power: power requirements/battery info
  153. **Commercial Info:**
  154. - condition: new/used/refurbished
  155. - warranty: warranty information
  156. - country_of_origin: manufacturing country
  157. - certifications: safety/quality certifications
  158. **Descriptive:**
  159. - key_features: list of 5-8 main features
  160. - benefits: main benefits/use cases
  161. - target_audience: who this is for
  162. - usage_instructions: how to use (if mentioned)
  163. - care_instructions: care/maintenance info
  164. - style: style/aesthetic (modern, vintage, etc)
  165. - season: seasonal relevance (if applicable)
  166. - occasion: suitable occasions (if applicable)
  167. **Additional:**
  168. - package_contents: what's included
  169. - variants: available variants/options
  170. - tags: relevant search tags (list)
  171. Only include fields where you have high confidence. Use null for uncertain values.
  172. For lists, provide all relevant items. Be thorough and extract every possible detail."""
  173. content = [{"type": "text", "text": prompt}]
  174. # Add images if available
  175. if product_data.get('images'):
  176. for img_path in product_data['images'][:3]: # Include up to 3 images for better context
  177. try:
  178. with open(img_path, 'rb') as f:
  179. img_data = base64.b64encode(f.read()).decode()
  180. # Determine media type
  181. media_type = "image/jpeg"
  182. if img_path.lower().endswith('.png'):
  183. media_type = "image/png"
  184. elif img_path.lower().endswith('.webp'):
  185. media_type = "image/webp"
  186. content.append({
  187. "type": "image",
  188. "source": {
  189. "type": "base64",
  190. "media_type": media_type,
  191. "data": img_data
  192. }
  193. })
  194. except Exception as e:
  195. print(f"Error processing image {img_path}: {e}")
  196. try:
  197. response = self.client.messages.create(
  198. model="claude-sonnet-4-20250514",
  199. max_tokens=2048, # Increased for comprehensive extraction
  200. messages=[{"role": "user", "content": content}]
  201. )
  202. # Parse JSON response
  203. import json
  204. llm_result = json.loads(response.content[0].text)
  205. # Add high confidence to LLM results
  206. for key in llm_result:
  207. if llm_result[key] is not None:
  208. llm_result[f'{key}_confidence'] = 0.95
  209. return llm_result
  210. except Exception as e:
  211. print(f"LLM extraction error: {e}")
  212. return {}
  213. def _identify_missing_attributes(self, existing_attrs: Dict) -> List[str]:
  214. """
  215. Identify which attributes are missing or low confidence
  216. """
  217. important_attrs = ['category', 'brand', 'color', 'size', 'material', 'key_features']
  218. missing = []
  219. for attr in important_attrs:
  220. if attr not in existing_attrs or existing_attrs.get(f'{attr}_confidence', 0) < 0.7:
  221. missing.append(attr)
  222. return missing
  223. def _merge_attributes(self, base: Dict, additional: Dict) -> Dict:
  224. """
  225. Intelligently merge attributes, preferring LLM for new attributes and validation
  226. """
  227. merged = {}
  228. # Start with all NLP attributes
  229. for key, value in base.items():
  230. if not key.endswith('_confidence'):
  231. merged[key] = value
  232. merged[f'{key}_confidence'] = base.get(f'{key}_confidence', 0.7)
  233. # Add or override with LLM attributes
  234. for key, value in additional.items():
  235. if key.endswith('_confidence'):
  236. continue
  237. if value is None:
  238. # Keep NLP value if LLM returns null
  239. continue
  240. # LLM found new attribute or better value
  241. if key not in merged:
  242. merged[key] = value
  243. merged[f'{key}_confidence'] = additional.get(f'{key}_confidence', 0.95)
  244. else:
  245. # Compare values - if different, prefer LLM but mark for review
  246. llm_conf = additional.get(f'{key}_confidence', 0.95)
  247. nlp_conf = merged.get(f'{key}_confidence', 0.7)
  248. if str(value).lower() != str(merged[key]).lower():
  249. # Values differ - use LLM but add conflict flag
  250. merged[key] = value
  251. merged[f'{key}_confidence'] = llm_conf
  252. merged[f'{key}_nlp_value'] = base.get(key) # Store NLP value for reference
  253. merged[f'{key}_conflict'] = True
  254. else:
  255. # Values match - boost confidence
  256. merged[key] = value
  257. merged[f'{key}_confidence'] = min(0.99, (llm_conf + nlp_conf) / 2 + 0.1)
  258. return merged
  259. # Example usage
  260. if __name__ == "__main__":
  261. extractor = HybridAttributeExtractor(anthropic_api_key="your-api-key")
  262. product = {
  263. 'title': 'Nike Air Max 270 Running Shoes - Black/White',
  264. 'description': 'Premium running shoes with Max Air cushioning. Breathable mesh upper, rubber outsole. Perfect for daily training.',
  265. 'images': ['path/to/image1.jpg', 'path/to/image2.jpg']
  266. }
  267. attributes = extractor.extract_attributes(product)
  268. print(attributes)