attribute_scorer.py 33 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828
  1. # # attribute_scorer.py (Enhanced)
  2. # import re
  3. # import time
  4. # from typing import Dict, List, Tuple
  5. # from rapidfuzz import fuzz, process
  6. # import spacy
  7. # from collections import defaultdict
  8. # import logging
  9. # logger = logging.getLogger(__name__)
  10. # class AttributeQualityScorer:
  11. # """
  12. # Enhanced scorer with AI-powered suggestions and robust error handling
  13. # """
  14. # def __init__(self, use_ai: bool = True):
  15. # # Load spaCy model
  16. # try:
  17. # self.nlp = spacy.load("en_core_web_sm")
  18. # except Exception as e:
  19. # logger.warning(f"spaCy model not loaded: {e}")
  20. # self.nlp = None
  21. # # Initialize Gemini service
  22. # self.use_ai = use_ai
  23. # if use_ai:
  24. # try:
  25. # from .gemini_service import GeminiAttributeService
  26. # self.ai_service = GeminiAttributeService()
  27. # except Exception as e:
  28. # logger.warning(f"Gemini service not available: {e}")
  29. # self.use_ai = False
  30. # self.ai_service = None
  31. # # Enhanced weights
  32. # self.weights = {
  33. # 'mandatory_fields': 0.40,
  34. # 'standardization': 0.30,
  35. # 'missing_values': 0.20,
  36. # 'consistency': 0.10
  37. # }
  38. # def score_product(self, product: Dict, category_rules: List[Dict], generate_ai_suggestions: bool = True) -> Dict:
  39. # """
  40. # Enhanced scoring with AI suggestions and guaranteed AI call
  41. # """
  42. # start_time = time.time()
  43. # attributes = product.get('attributes', {})
  44. # category = product.get('category', '')
  45. # # Initialize scores
  46. # scores = {
  47. # 'mandatory_fields': 0,
  48. # 'standardization': 0,
  49. # 'missing_values': 0,
  50. # 'consistency': 0
  51. # }
  52. # issues = []
  53. # suggestions = []
  54. # # --- Mandatory Fields ---
  55. # try:
  56. # mandatory_score, mandatory_issues, mandatory_suggestions = self._check_mandatory_fields(attributes, category_rules)
  57. # scores['mandatory_fields'] = mandatory_score
  58. # issues.extend(mandatory_issues)
  59. # suggestions.extend(mandatory_suggestions)
  60. # except Exception as e:
  61. # logger.error(f"[Mandatory Fields] Error: {e}")
  62. # scores['mandatory_fields'] = 0
  63. # # --- Standardization ---
  64. # try:
  65. # std_score, std_issues, std_suggestions = self._check_standardization(attributes, category_rules)
  66. # scores['standardization'] = std_score
  67. # issues.extend(std_issues)
  68. # suggestions.extend(std_suggestions)
  69. # except Exception as e:
  70. # logger.error(f"[Standardization] Error: {e}")
  71. # scores['standardization'] = 0
  72. # # --- Missing Values ---
  73. # try:
  74. # missing_score, missing_issues, missing_suggestions = self._check_missing_values(attributes, category_rules)
  75. # scores['missing_values'] = missing_score
  76. # issues.extend(missing_issues)
  77. # suggestions.extend(missing_suggestions)
  78. # except Exception as e:
  79. # logger.error(f"[Missing Values] Error: {e}")
  80. # scores['missing_values'] = 0
  81. # # --- Consistency ---
  82. # try:
  83. # consistency_score, consistency_issues = self._check_consistency(attributes, product.get('title', ''), product.get('description', ''))
  84. # scores['consistency'] = consistency_score
  85. # issues.extend(consistency_issues)
  86. # except Exception as e:
  87. # logger.error(f"[Consistency] Error: {e}")
  88. # scores['consistency'] = 0
  89. # # --- Final Score ---
  90. # final_score = sum(scores[key] * self.weights[key] for key in scores)
  91. # result = {
  92. # 'final_score': round(final_score, 2),
  93. # 'max_score': 100.0,
  94. # 'breakdown': scores,
  95. # 'issues': issues,
  96. # 'suggestions': suggestions,
  97. # 'weights': self.weights,
  98. # 'processing_time': round(time.time() - start_time, 3)
  99. # }
  100. # # --- AI Suggestions (Guaranteed Attempt) ---
  101. # if generate_ai_suggestions and self.use_ai:
  102. # try:
  103. # logger.info(f"Generating AI suggestions for SKU: {product.get('sku')}, issues count: {len(issues)}")
  104. # ai_suggestions = self.ai_service.generate_attribute_suggestions(
  105. # product,
  106. # issues,
  107. # category_rules
  108. # )
  109. # if not ai_suggestions:
  110. # logger.warning(f"AI service returned empty suggestions for SKU: {product.get('sku')}")
  111. # ai_suggestions = {"note": "No AI suggestions generated"}
  112. # result['ai_suggestions'] = ai_suggestions
  113. # except Exception as e:
  114. # logger.error(f"[AI Suggestions] Error: {e}")
  115. # result['ai_suggestions'] = {'error': str(e)}
  116. # return result
  117. # def _check_mandatory_fields(self, attributes: Dict, rules: List[Dict]) -> Tuple[float, List, List]:
  118. # """Enhanced mandatory field validation with detailed feedback"""
  119. # mandatory_rules = [r for r in rules if r.get('is_mandatory', False)]
  120. # if not mandatory_rules:
  121. # return 100.0, [], []
  122. # present_count = 0
  123. # issues = []
  124. # suggestions = []
  125. # for rule in mandatory_rules:
  126. # attr_name = rule['attribute_name']
  127. # # Check presence and validity
  128. # if attr_name in attributes and attributes[attr_name]:
  129. # value = str(attributes[attr_name]).strip()
  130. # if not value:
  131. # issues.append(f"Mandatory field '{attr_name}' is empty")
  132. # suggestions.append(f"Provide a non-empty value for {attr_name}")
  133. # continue
  134. # # Check length constraints
  135. # min_len = rule.get('min_length')
  136. # max_len = rule.get('max_length')
  137. # if min_len and len(value) < min_len:
  138. # issues.append(f"'{attr_name}' too short (min: {min_len} chars)")
  139. # suggestions.append(f"Expand {attr_name} to at least {min_len} characters")
  140. # continue
  141. # if max_len and len(value) > max_len:
  142. # issues.append(f"'{attr_name}' too long (max: {max_len} chars)")
  143. # suggestions.append(f"Shorten {attr_name} to {max_len} characters or less")
  144. # continue
  145. # # Check regex pattern
  146. # regex = rule.get('validation_regex')
  147. # if regex and not re.match(regex, value):
  148. # issues.append(f"'{attr_name}' format invalid")
  149. # suggestions.append(f"Ensure {attr_name} matches required format")
  150. # continue
  151. # present_count += 1
  152. # else:
  153. # issues.append(f"Missing mandatory field: {attr_name}")
  154. # desc = rule.get('description', '')
  155. # if desc:
  156. # suggestions.append(f"Add {attr_name}: {desc}")
  157. # else:
  158. # suggestions.append(f"Add required attribute: {attr_name}")
  159. # score = (present_count / len(mandatory_rules)) * 100 if mandatory_rules else 100.0
  160. # return score, issues, suggestions
  161. # def _check_standardization(self, attributes: Dict, rules: List[Dict]) -> Tuple[float, List, List]:
  162. # """Enhanced standardization with better fuzzy matching"""
  163. # standardized_rules = [r for r in rules if r.get('valid_values')]
  164. # if not standardized_rules:
  165. # return 100.0, [], []
  166. # correct_count = 0
  167. # issues = []
  168. # suggestions = []
  169. # for rule in standardized_rules:
  170. # attr_name = rule['attribute_name']
  171. # valid_values = rule['valid_values']
  172. # if attr_name not in attributes or not attributes[attr_name]:
  173. # continue
  174. # actual_value = str(attributes[attr_name]).strip()
  175. # if not actual_value:
  176. # continue
  177. # # Exact match (case-insensitive)
  178. # if actual_value in valid_values:
  179. # correct_count += 1
  180. # continue
  181. # # Case-insensitive match
  182. # lower_valid = {v.lower(): v for v in valid_values}
  183. # if actual_value.lower() in lower_valid:
  184. # correct_count += 1
  185. # if actual_value != lower_valid[actual_value.lower()]:
  186. # issues.append(f"{attr_name}: Case mismatch - '{actual_value}' should be '{lower_valid[actual_value.lower()]}'")
  187. # suggestions.append(f"Correct capitalization of {attr_name} to: {lower_valid[actual_value.lower()]}")
  188. # continue
  189. # # Fuzzy matching with multiple scorers
  190. # best_match = None
  191. # best_score = 0
  192. # for scorer in [fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio]:
  193. # match = process.extractOne(actual_value, valid_values, scorer=scorer)
  194. # if match and match[1] > best_score:
  195. # best_match = match
  196. # best_score = match[1]
  197. # if best_match and best_score >= 80:
  198. # correct_count += 1
  199. # if best_score < 100:
  200. # issues.append(f"{attr_name}: '{actual_value}' likely means '{best_match[0]}' (confidence: {best_score}%)")
  201. # suggestions.append(f"Standardize {attr_name} to: {best_match[0]}")
  202. # else:
  203. # issues.append(f"{attr_name}: '{actual_value}' not recognized. Valid: {', '.join(valid_values[:5])}")
  204. # suggestions.append(f"Change {attr_name} to one of: {', '.join(valid_values[:3])}")
  205. # score = (correct_count / len(standardized_rules)) * 100 if standardized_rules else 100.0
  206. # return score, issues, suggestions
  207. # def _check_missing_values(self, attributes: Dict, rules: List[Dict]) -> Tuple[float, List, List]:
  208. # """Enhanced placeholder detection"""
  209. # placeholder_patterns = [
  210. # r'^n/?a$', r'^none$', r'^null$', r'^-+$', r'^\.+$',
  211. # r'^tbd$', r'^to be determined$', r'^unknown$', r'^na$',
  212. # r'^todo$', r'^pending$', r'^\?+$', r'^xxx+$', r'^placeholder$'
  213. # ]
  214. # total_attrs = len(rules)
  215. # valid_count = 0
  216. # issues = []
  217. # suggestions = []
  218. # for rule in rules:
  219. # attr_name = rule['attribute_name']
  220. # if attr_name not in attributes:
  221. # continue
  222. # value = str(attributes[attr_name]).strip()
  223. # # Check if empty
  224. # if not value:
  225. # issues.append(f"'{attr_name}' is empty")
  226. # suggestions.append(f"Provide a valid value for {attr_name}")
  227. # continue
  228. # # Check if placeholder
  229. # value_lower = value.lower()
  230. # is_placeholder = any(re.match(pattern, value_lower, re.IGNORECASE) for pattern in placeholder_patterns)
  231. # if is_placeholder:
  232. # issues.append(f"'{attr_name}' contains placeholder: '{value}'")
  233. # suggestions.append(f"Replace placeholder in {attr_name} with actual data")
  234. # continue
  235. # # Check for suspicious patterns
  236. # if len(value) < 2 and rule.get('is_mandatory'):
  237. # issues.append(f"'{attr_name}' suspiciously short: '{value}'")
  238. # suggestions.append(f"Provide more detailed {attr_name}")
  239. # continue
  240. # valid_count += 1
  241. # score = (valid_count / total_attrs) * 100 if total_attrs > 0 else 100.0
  242. # return score, issues, suggestions
  243. # def _check_consistency(self, attributes: Dict, title: str, description: str) -> Tuple[float, List]:
  244. # """Enhanced consistency checking with context awareness"""
  245. # issues = []
  246. # consistency_count = 0
  247. # total_checks = 0
  248. # check_attrs = ['brand', 'color', 'size', 'material', 'model', 'weight', 'dimensions']
  249. # combined_text = f"{title} {description}".lower()
  250. # for attr in check_attrs:
  251. # if attr not in attributes or not attributes[attr]:
  252. # continue
  253. # total_checks += 1
  254. # attr_value = str(attributes[attr]).lower().strip()
  255. # # Skip very short values
  256. # if len(attr_value) < 2:
  257. # consistency_count += 1
  258. # continue
  259. # # Direct substring match
  260. # if attr_value in combined_text:
  261. # consistency_count += 1
  262. # continue
  263. # # Word boundary match
  264. # words_in_text = set(combined_text.split())
  265. # words_in_attr = set(attr_value.split())
  266. # if words_in_attr.issubset(words_in_text):
  267. # consistency_count += 1
  268. # continue
  269. # # Fuzzy word matching
  270. # text_words = combined_text.split()
  271. # matches = 0
  272. # for attr_word in words_in_attr:
  273. # match = process.extractOne(attr_word, text_words, scorer=fuzz.ratio)
  274. # if match and match[1] >= 80:
  275. # matches += 1
  276. # if matches / len(words_in_attr) >= 0.7:
  277. # consistency_count += 1
  278. # continue
  279. # issues.append(f"'{attr.title()}': '{attributes[attr]}' not clearly mentioned in title/description")
  280. # score = (consistency_count / total_checks) * 100 if total_checks > 0 else 100.0
  281. # return score, issues
  282. # def extract_attributes_from_text(self, text: str, category: str = '') -> Dict:
  283. # """Enhanced attribute extraction with AI fallback"""
  284. # extracted = {}
  285. # # Try spaCy first
  286. # if self.nlp:
  287. # extracted = self._extract_with_spacy(text)
  288. # # Use AI if available and spaCy found little
  289. # if self.use_ai and len(extracted) < 3:
  290. # try:
  291. # ai_extracted = self.ai_service.extract_attributes_with_ai(text, '', category)
  292. # extracted.update({k: v for k, v in ai_extracted.items() if v})
  293. # except Exception as e:
  294. # logger.error(f"AI extraction failed: {e}")
  295. # return extracted
  296. # def _extract_with_spacy(self, text: str) -> Dict:
  297. # """Extract using spaCy NER"""
  298. # doc = self.nlp(text)
  299. # extracted = defaultdict(list)
  300. # for ent in doc.ents:
  301. # if ent.label_ == 'ORG':
  302. # extracted['brand'].append(ent.text)
  303. # elif ent.label_ == 'QUANTITY':
  304. # extracted['size'].append(ent.text)
  305. # elif ent.label_ == 'PRODUCT':
  306. # extracted['product_type'].append(ent.text)
  307. # # Color detection
  308. # colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'orange',
  309. # 'purple', 'pink', 'brown', 'gray', 'grey', 'silver', 'gold']
  310. # text_lower = text.lower()
  311. # for color in colors:
  312. # if color in text_lower:
  313. # extracted['color'].append(color.title())
  314. # # Return most common value
  315. # result = {}
  316. # for key, values in extracted.items():
  317. # if values:
  318. # result[key] = max(set(values), key=values.count)
  319. # return result
  320. # attribute_scorer.py (Enhanced with SEO)
  321. import re
  322. import time
  323. from typing import Dict, List, Tuple
  324. from rapidfuzz import fuzz, process
  325. import spacy
  326. from collections import defaultdict
  327. import logging
  328. logger = logging.getLogger(__name__)
  329. class AttributeQualityScorer:
  330. """
  331. Enhanced scorer with AI-powered suggestions, robust error handling, and SEO scoring
  332. """
  333. def __init__(self, use_ai: bool = True, use_seo: bool = True):
  334. # Load spaCy model
  335. try:
  336. self.nlp = spacy.load("en_core_web_sm")
  337. except Exception as e:
  338. logger.warning(f"spaCy model not loaded: {e}")
  339. self.nlp = None
  340. # Initialize Gemini service
  341. self.use_ai = use_ai
  342. if use_ai:
  343. try:
  344. from .gemini_service import GeminiAttributeService
  345. self.ai_service = GeminiAttributeService()
  346. except Exception as e:
  347. logger.warning(f"Gemini service not available: {e}")
  348. self.use_ai = False
  349. self.ai_service = None
  350. # Initialize SEO scorer
  351. self.use_seo = use_seo
  352. if use_seo:
  353. try:
  354. from .seo_scorer import SEODiscoverabilityScorer
  355. self.seo_scorer = SEODiscoverabilityScorer()
  356. except Exception as e:
  357. logger.warning(f"SEO scorer not available: {e}")
  358. self.use_seo = False
  359. self.seo_scorer = None
  360. # Updated weights to include SEO (total = 100%)
  361. self.weights = {
  362. 'mandatory_fields': 0.34, # Reduced from 40% -> 34%
  363. 'standardization': 0.26, # Reduced from 30% -> 26%
  364. 'missing_values': 0.17, # Reduced from 20% -> 17%
  365. 'consistency': 0.08, # Reduced from 10% -> 8%
  366. 'seo_discoverability': 0.15 # NEW: 15%
  367. }
  368. def score_product(self, product: Dict, category_rules: List[Dict], generate_ai_suggestions: bool = True) -> Dict:
  369. """
  370. Enhanced scoring with AI suggestions, SEO scoring, and guaranteed AI call
  371. """
  372. start_time = time.time()
  373. attributes = product.get('attributes', {})
  374. category = product.get('category', '')
  375. # Initialize scores
  376. scores = {
  377. 'mandatory_fields': 0,
  378. 'standardization': 0,
  379. 'missing_values': 0,
  380. 'consistency': 0,
  381. 'seo_discoverability': 0
  382. }
  383. issues = []
  384. suggestions = []
  385. # --- Mandatory Fields ---
  386. try:
  387. mandatory_score, mandatory_issues, mandatory_suggestions = self._check_mandatory_fields(attributes, category_rules)
  388. scores['mandatory_fields'] = mandatory_score
  389. issues.extend(mandatory_issues)
  390. suggestions.extend(mandatory_suggestions)
  391. except Exception as e:
  392. logger.error(f"[Mandatory Fields] Error: {e}")
  393. scores['mandatory_fields'] = 0
  394. # --- Standardization ---
  395. try:
  396. std_score, std_issues, std_suggestions = self._check_standardization(attributes, category_rules)
  397. scores['standardization'] = std_score
  398. issues.extend(std_issues)
  399. suggestions.extend(std_suggestions)
  400. except Exception as e:
  401. logger.error(f"[Standardization] Error: {e}")
  402. scores['standardization'] = 0
  403. # --- Missing Values ---
  404. try:
  405. missing_score, missing_issues, missing_suggestions = self._check_missing_values(attributes, category_rules)
  406. scores['missing_values'] = missing_score
  407. issues.extend(missing_issues)
  408. suggestions.extend(missing_suggestions)
  409. except Exception as e:
  410. logger.error(f"[Missing Values] Error: {e}")
  411. scores['missing_values'] = 0
  412. # --- Consistency ---
  413. try:
  414. consistency_score, consistency_issues = self._check_consistency(attributes, product.get('title', ''), product.get('description', ''))
  415. scores['consistency'] = consistency_score
  416. issues.extend(consistency_issues)
  417. except Exception as e:
  418. logger.error(f"[Consistency] Error: {e}")
  419. scores['consistency'] = 0
  420. # --- SEO & Discoverability (NEW) ---
  421. seo_result = None
  422. if self.use_seo and self.seo_scorer:
  423. try:
  424. seo_result = self.seo_scorer.score_seo(product, category_rules)
  425. scores['seo_discoverability'] = seo_result['seo_score']
  426. issues.extend(seo_result['issues'])
  427. suggestions.extend(seo_result['suggestions'])
  428. except Exception as e:
  429. logger.error(f"[SEO Scoring] Error: {e}")
  430. scores['seo_discoverability'] = 0
  431. # --- Final Score ---
  432. final_score = sum(scores[key] * self.weights[key] for key in scores)
  433. result = {
  434. 'final_score': round(final_score, 2),
  435. 'max_score': 100.0,
  436. 'breakdown': scores,
  437. 'issues': issues,
  438. 'suggestions': suggestions,
  439. 'weights': self.weights,
  440. 'processing_time': round(time.time() - start_time, 3)
  441. }
  442. # Add SEO-specific details
  443. if seo_result:
  444. result['seo_details'] = {
  445. 'breakdown': seo_result['breakdown'],
  446. 'extracted_keywords': seo_result.get('extracted_keywords', []),
  447. 'missing_high_value_terms': seo_result.get('missing_high_value_terms', [])
  448. }
  449. # --- AI Suggestions (Guaranteed Attempt) ---
  450. if generate_ai_suggestions and self.use_ai:
  451. try:
  452. logger.info(f"Generating AI suggestions for SKU: {product.get('sku')}, issues count: {len(issues)}")
  453. ai_suggestions = self.ai_service.generate_attribute_suggestions(
  454. product,
  455. issues,
  456. category_rules
  457. )
  458. if not ai_suggestions:
  459. logger.warning(f"AI service returned empty suggestions for SKU: {product.get('sku')}")
  460. ai_suggestions = {"note": "No AI suggestions generated"}
  461. result['ai_suggestions'] = ai_suggestions
  462. except Exception as e:
  463. logger.error(f"[AI Suggestions] Error: {e}")
  464. result['ai_suggestions'] = {'error': str(e)}
  465. return result
  466. def _check_mandatory_fields(self, attributes: Dict, rules: List[Dict]) -> Tuple[float, List, List]:
  467. """Enhanced mandatory field validation with detailed feedback"""
  468. mandatory_rules = [r for r in rules if r.get('is_mandatory', False)]
  469. if not mandatory_rules:
  470. return 100.0, [], []
  471. present_count = 0
  472. issues = []
  473. suggestions = []
  474. for rule in mandatory_rules:
  475. attr_name = rule['attribute_name']
  476. # Check presence and validity
  477. if attr_name in attributes and attributes[attr_name]:
  478. value = str(attributes[attr_name]).strip()
  479. if not value:
  480. issues.append(f"Mandatory field '{attr_name}' is empty")
  481. suggestions.append(f"Provide a non-empty value for {attr_name}")
  482. continue
  483. # Check length constraints
  484. min_len = rule.get('min_length')
  485. max_len = rule.get('max_length')
  486. if min_len and len(value) < min_len:
  487. issues.append(f"'{attr_name}' too short (min: {min_len} chars)")
  488. suggestions.append(f"Expand {attr_name} to at least {min_len} characters")
  489. continue
  490. if max_len and len(value) > max_len:
  491. issues.append(f"'{attr_name}' too long (max: {max_len} chars)")
  492. suggestions.append(f"Shorten {attr_name} to {max_len} characters or less")
  493. continue
  494. # Check regex pattern
  495. regex = rule.get('validation_regex')
  496. if regex and not re.match(regex, value):
  497. issues.append(f"'{attr_name}' format invalid")
  498. suggestions.append(f"Ensure {attr_name} matches required format")
  499. continue
  500. present_count += 1
  501. else:
  502. issues.append(f"Missing mandatory field: {attr_name}")
  503. desc = rule.get('description', '')
  504. if desc:
  505. suggestions.append(f"Add {attr_name}: {desc}")
  506. else:
  507. suggestions.append(f"Add required attribute: {attr_name}")
  508. score = (present_count / len(mandatory_rules)) * 100 if mandatory_rules else 100.0
  509. return score, issues, suggestions
  510. def _check_standardization(self, attributes: Dict, rules: List[Dict]) -> Tuple[float, List, List]:
  511. """Enhanced standardization with better fuzzy matching"""
  512. standardized_rules = [r for r in rules if r.get('valid_values')]
  513. if not standardized_rules:
  514. return 100.0, [], []
  515. correct_count = 0
  516. issues = []
  517. suggestions = []
  518. for rule in standardized_rules:
  519. attr_name = rule['attribute_name']
  520. valid_values = rule['valid_values']
  521. if attr_name not in attributes or not attributes[attr_name]:
  522. continue
  523. actual_value = str(attributes[attr_name]).strip()
  524. if not actual_value:
  525. continue
  526. # Exact match (case-insensitive)
  527. if actual_value in valid_values:
  528. correct_count += 1
  529. continue
  530. # Case-insensitive match
  531. lower_valid = {v.lower(): v for v in valid_values}
  532. if actual_value.lower() in lower_valid:
  533. correct_count += 1
  534. if actual_value != lower_valid[actual_value.lower()]:
  535. issues.append(f"{attr_name}: Case mismatch - '{actual_value}' should be '{lower_valid[actual_value.lower()]}'")
  536. suggestions.append(f"Correct capitalization of {attr_name} to: {lower_valid[actual_value.lower()]}")
  537. continue
  538. # Fuzzy matching with multiple scorers
  539. best_match = None
  540. best_score = 0
  541. for scorer in [fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio]:
  542. match = process.extractOne(actual_value, valid_values, scorer=scorer)
  543. if match and match[1] > best_score:
  544. best_match = match
  545. best_score = match[1]
  546. if best_match and best_score >= 80:
  547. correct_count += 1
  548. if best_score < 100:
  549. issues.append(f"{attr_name}: '{actual_value}' likely means '{best_match[0]}' (confidence: {best_score}%)")
  550. suggestions.append(f"Standardize {attr_name} to: {best_match[0]}")
  551. else:
  552. issues.append(f"{attr_name}: '{actual_value}' not recognized. Valid: {', '.join(valid_values[:5])}")
  553. suggestions.append(f"Change {attr_name} to one of: {', '.join(valid_values[:3])}")
  554. score = (correct_count / len(standardized_rules)) * 100 if standardized_rules else 100.0
  555. return score, issues, suggestions
  556. def _check_missing_values(self, attributes: Dict, rules: List[Dict]) -> Tuple[float, List, List]:
  557. """Enhanced placeholder detection"""
  558. placeholder_patterns = [
  559. r'^n/?a', r'^none', r'^null', r'^-+', r'^\.+'
  560. r'^tbd', r'^to be determined', r'^unknown', r'^na',
  561. r'^todo', r'^pending', r'^\?+', r'^xxx+', r'^placeholder'
  562. ]
  563. total_attrs = len(rules)
  564. valid_count = 0
  565. issues = []
  566. suggestions = []
  567. for rule in rules:
  568. attr_name = rule['attribute_name']
  569. if attr_name not in attributes:
  570. continue
  571. value = str(attributes[attr_name]).strip()
  572. # Check if empty
  573. if not value:
  574. issues.append(f"'{attr_name}' is empty")
  575. suggestions.append(f"Provide a valid value for {attr_name}")
  576. continue
  577. # Check if placeholder
  578. value_lower = value.lower()
  579. is_placeholder = any(re.match(pattern, value_lower, re.IGNORECASE) for pattern in placeholder_patterns)
  580. if is_placeholder:
  581. issues.append(f"'{attr_name}' contains placeholder: '{value}'")
  582. suggestions.append(f"Replace placeholder in {attr_name} with actual data")
  583. continue
  584. # Check for suspicious patterns
  585. if len(value) < 2 and rule.get('is_mandatory'):
  586. issues.append(f"'{attr_name}' suspiciously short: '{value}'")
  587. suggestions.append(f"Provide more detailed {attr_name}")
  588. continue
  589. valid_count += 1
  590. score = (valid_count / total_attrs) * 100 if total_attrs > 0 else 100.0
  591. return score, issues, suggestions
  592. def _check_consistency(self, attributes: Dict, title: str, description: str) -> Tuple[float, List]:
  593. """Enhanced consistency checking with context awareness"""
  594. issues = []
  595. consistency_count = 0
  596. total_checks = 0
  597. check_attrs = ['brand', 'color', 'size', 'material', 'model', 'weight', 'dimensions']
  598. combined_text = f"{title} {description}".lower()
  599. for attr in check_attrs:
  600. if attr not in attributes or not attributes[attr]:
  601. continue
  602. total_checks += 1
  603. attr_value = str(attributes[attr]).lower().strip()
  604. # Skip very short values
  605. if len(attr_value) < 2:
  606. consistency_count += 1
  607. continue
  608. # Direct substring match
  609. if attr_value in combined_text:
  610. consistency_count += 1
  611. continue
  612. # Word boundary match
  613. words_in_text = set(combined_text.split())
  614. words_in_attr = set(attr_value.split())
  615. if words_in_attr.issubset(words_in_text):
  616. consistency_count += 1
  617. continue
  618. # Fuzzy word matching
  619. text_words = combined_text.split()
  620. matches = 0
  621. for attr_word in words_in_attr:
  622. match = process.extractOne(attr_word, text_words, scorer=fuzz.ratio)
  623. if match and match[1] >= 80:
  624. matches += 1
  625. if matches / len(words_in_attr) >= 0.7:
  626. consistency_count += 1
  627. continue
  628. issues.append(f"'{attr.title()}': '{attributes[attr]}' not clearly mentioned in title/description")
  629. score = (consistency_count / total_checks) * 100 if total_checks > 0 else 100.0
  630. return score, issues
  631. def extract_attributes_from_text(self, text: str, category: str = '') -> Dict:
  632. """Enhanced attribute extraction with AI fallback"""
  633. extracted = {}
  634. # Try spaCy first
  635. if self.nlp:
  636. extracted = self._extract_with_spacy(text)
  637. # Use AI if available and spaCy found little
  638. if self.use_ai and len(extracted) < 3:
  639. try:
  640. ai_extracted = self.ai_service.extract_attributes_with_ai(text, '', category)
  641. extracted.update({k: v for k, v in ai_extracted.items() if v})
  642. except Exception as e:
  643. logger.error(f"AI extraction failed: {e}")
  644. return extracted
  645. def _extract_with_spacy(self, text: str) -> Dict:
  646. """Extract using spaCy NER"""
  647. doc = self.nlp(text)
  648. extracted = defaultdict(list)
  649. for ent in doc.ents:
  650. if ent.label_ == 'ORG':
  651. extracted['brand'].append(ent.text)
  652. elif ent.label_ == 'QUANTITY':
  653. extracted['size'].append(ent.text)
  654. elif ent.label_ == 'PRODUCT':
  655. extracted['product_type'].append(ent.text)
  656. # Color detection
  657. colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'orange',
  658. 'purple', 'pink', 'brown', 'gray', 'grey', 'silver', 'gold']
  659. text_lower = text.lower()
  660. for color in colors:
  661. if color in text_lower:
  662. extracted['color'].append(color.title())
  663. # Return most common value
  664. result = {}
  665. for key, values in extracted.items():
  666. if values:
  667. result[key] = max(set(values), key=values.count)
  668. return result