# attribute_scorer_integrated.py
"""
Enhanced AttributeQualityScorer with ProductContentRule integration
"""
import re
import time
import logging
from typing import Dict, List, Tuple
from rapidfuzz import fuzz, process
from collections import defaultdict

logger = logging.getLogger(__name__)


class AttributeQualityScorer:
    """
    Complete Product Quality Scorer with ALL components INCLUDING ProductContentRule validation:
    - Mandatory Fields (20%)
    - Standardization (15%)
    - Missing Values (10%)
    - Consistency (5%)
    - SEO Discoverability (10%)
    - Content Rules Compliance (15%) ← NEW: Validates against ProductContentRule
    - Title Quality (10%)
    - Description Quality (15%)
    """

    def __init__(self, use_ai: bool = True, use_seo: bool = True):
        # Load spaCy model (optional, for advanced NER)
        self.nlp = None
        try:
            import spacy
            self.nlp = spacy.load("en_core_web_sm")
            logger.info("spaCy model loaded successfully")
        except Exception as e:
            logger.warning(f"spaCy model not loaded: {e}. Will use fallback methods.")

        # Initialize AI service
        self.use_ai = use_ai
        self.ai_service = None
        if use_ai:
            try:
                from .gemini_service import GeminiAttributeService
                self.ai_service = GeminiAttributeService()
                logger.info("Gemini AI service initialized")
            except Exception as e:
                logger.warning(f"Gemini service not available: {e}")
                self.use_ai = False

        # Initialize SEO scorer
        self.use_seo = use_seo
        self.seo_scorer = None
        if use_seo:
            try:
                from .seo_scorer import SEODiscoverabilityScorer
                self.seo_scorer = SEODiscoverabilityScorer()
                logger.info("SEO scorer initialized")
            except Exception as e:
                logger.warning(f"SEO scorer not available: {e}")
                self.use_seo = False

        # Initialize Title/Description scorer
        self.title_desc_scorer = None
        try:
            from .title_description_scorer import TitleDescriptionScorer
            self.title_desc_scorer = TitleDescriptionScorer(use_ai=use_ai)
            logger.info("Title/Description scorer initialized")
        except Exception as e:
            logger.warning(f"Title/Description scorer not available: {e}")

        # Initialize Content Rules scorer ← NEW
        self.content_rules_scorer = None
        try:
            from .content_rules_scorer import ContentRulesScorer
            self.content_rules_scorer = ContentRulesScorer()
            logger.info("Content Rules scorer initialized")
        except Exception as e:
            logger.warning(f"Content Rules scorer not available: {e}")

        # UPDATED WEIGHTS (Total = 100%)
        self.weights = {
            'mandatory_fields': 0.20,          # 20% (reduced from 25%)
            'standardization': 0.15,           # 15% (reduced from 20%)
            'missing_values': 0.10,            # 10% (reduced from 13%)
            'consistency': 0.05,               # 5% (reduced from 7%)
            'seo_discoverability': 0.10,       # 10%
            'content_rules_compliance': 0.15,  # 15% ← NEW: ProductContentRule validation
            'title_quality': 0.10,             # 10%
            'description_quality': 0.15        # 15%
        }
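        # Worked example of the weighted blend (illustrative numbers only): each component
        # score is on a 0-100 scale and the weights above sum to 1.0, so component scores of
        # mandatory 100, standardization 80, missing 100, consistency 60, SEO 70,
        # content rules 100, title 90, description 80 would give
        # 100*0.20 + 80*0.15 + 100*0.10 + 60*0.05 + 70*0.10 + 100*0.15 + 90*0.10 + 80*0.15 = 88.0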

    def score_product(
        self,
        product: Dict,
        category_rules: List[Dict],
        content_rules: List[Dict] = None,  # ← NEW parameter
        generate_ai_suggestions: bool = True
    ) -> Dict:
        """
        Complete product scoring with all components including ProductContentRule validation

        Args:
            product: Product dict with sku, category, title, description, attributes
            category_rules: List of CategoryAttributeRule dicts
            content_rules: List of ProductContentRule dicts (NEW)
            generate_ai_suggestions: Whether to call AI for suggestions

        Returns:
            Dict with final_score, breakdown, issues, suggestions, etc.
        """
        start_time = time.time()
        attributes = product.get('attributes', {})
        category = product.get('category', '')

        # Initialize scores
        scores = {
            'mandatory_fields': 0,
            'standardization': 0,
            'missing_values': 0,
            'consistency': 0,
            'seo_discoverability': 0,
            'content_rules_compliance': 0,  # ← NEW
            'title_quality': 0,
            'description_quality': 0
        }
        issues = []
        suggestions = []

        # --- 1. Mandatory Fields (20%) ---
        try:
            mandatory_score, mandatory_issues, mandatory_suggestions = self._check_mandatory_fields(
                attributes, category_rules
            )
            scores['mandatory_fields'] = mandatory_score
            issues.extend(mandatory_issues)
            suggestions.extend(mandatory_suggestions)
        except Exception as e:
            logger.error(f"[Mandatory Fields] Error: {e}", exc_info=True)
            scores['mandatory_fields'] = 0

        # --- 2. Standardization (15%) ---
        try:
            std_score, std_issues, std_suggestions = self._check_standardization(
                attributes, category_rules
            )
            scores['standardization'] = std_score
            issues.extend(std_issues)
            suggestions.extend(std_suggestions)
        except Exception as e:
            logger.error(f"[Standardization] Error: {e}", exc_info=True)
            scores['standardization'] = 0

        # --- 3. Missing Values (10%) ---
        try:
            missing_score, missing_issues, missing_suggestions = self._check_missing_values(
                attributes, category_rules
            )
            scores['missing_values'] = missing_score
            issues.extend(missing_issues)
            suggestions.extend(missing_suggestions)
        except Exception as e:
            logger.error(f"[Missing Values] Error: {e}", exc_info=True)
            scores['missing_values'] = 0

        # --- 4. Consistency (5%) ---
        try:
            consistency_score, consistency_issues = self._check_consistency(
                attributes,
                product.get('title', ''),
                product.get('description', '')
            )
            scores['consistency'] = consistency_score
            issues.extend(consistency_issues)
        except Exception as e:
            logger.error(f"[Consistency] Error: {e}", exc_info=True)
            scores['consistency'] = 0

        # --- 5. SEO Discoverability (10%) ---
        seo_result = None
        if self.use_seo and self.seo_scorer:
            try:
                seo_result = self.seo_scorer.score_seo(product, category_rules)
                scores['seo_discoverability'] = seo_result['seo_score']
                issues.extend(seo_result['issues'])
                suggestions.extend(seo_result['suggestions'])
            except Exception as e:
                logger.error(f"[SEO Scoring] Error: {e}", exc_info=True)
                scores['seo_discoverability'] = 0

        # --- 6. Content Rules Compliance (15%) ← NEW ---
        content_rules_result = None
        if content_rules and self.content_rules_scorer:
            try:
                content_rules_result = self.content_rules_scorer.score_content_fields(
                    product, content_rules
                )
                scores['content_rules_compliance'] = content_rules_result['overall_content_score']
                issues.extend(content_rules_result['issues'])
                suggestions.extend(content_rules_result['suggestions'])
            except Exception as e:
                logger.error(f"[Content Rules] Error: {e}", exc_info=True)
                scores['content_rules_compliance'] = 0
        else:
            # If no content rules provided, give neutral score
            scores['content_rules_compliance'] = 100.0

        # --- 7. Title & Description Quality (25% combined) ---
        title_desc_result = None
        if self.title_desc_scorer:
            try:
                title_desc_result = self.title_desc_scorer.score_title_and_description(
                    product,
                    category_rules
                )
                scores['title_quality'] = title_desc_result['title_score']
                scores['description_quality'] = title_desc_result['description_score']
                issues.extend(title_desc_result['issues'])
                suggestions.extend(title_desc_result['suggestions'])
            except Exception as e:
                logger.error(f"[Title/Description Scoring] Error: {e}", exc_info=True)
                scores['title_quality'] = 0
                scores['description_quality'] = 0

        # --- Final Score Calculation ---
        final_score = sum(scores[key] * self.weights[key] for key in scores)

        result = {
            'final_score': round(final_score, 2),
            'max_score': 100.0,
            'breakdown': scores,
            'issues': issues,
            'suggestions': suggestions,
            'weights': self.weights,
            'processing_time': round(time.time() - start_time, 3)
        }

        # Add component-specific details
        if seo_result:
            result['seo_details'] = {
                'breakdown': seo_result['breakdown'],
                'extracted_keywords': seo_result.get('extracted_keywords', []),
                'missing_high_value_terms': seo_result.get('missing_high_value_terms', [])
            }
        if content_rules_result:
            result['content_rules_details'] = {
                'field_scores': content_rules_result['field_scores'],
                'rules_applied': content_rules_result['rules_applied']
            }
        if title_desc_result:
            result['title_description_details'] = {
                'title_breakdown': title_desc_result.get('title_breakdown', {}),
                'description_breakdown': title_desc_result.get('description_breakdown', {}),
                'ai_improvements': title_desc_result.get('ai_improvements')
            }

        # --- AI Suggestions (Comprehensive) ---
        if generate_ai_suggestions and self.use_ai and self.ai_service:
            try:
                logger.info(f"Generating comprehensive AI suggestions for SKU: {product.get('sku')}")
                ai_suggestions = self.ai_service.generate_comprehensive_suggestions(
                    product,
                    issues,
                    category_rules,
                    scores
                )
                result['ai_suggestions'] = ai_suggestions
            except Exception as e:
                logger.error(f"[AI Suggestions] Error: {e}", exc_info=True)
                result['ai_suggestions'] = {'error': str(e)}

        return result

    # ========== ATTRIBUTE VALIDATION METHODS (unchanged) ==========

    def _check_mandatory_fields(
        self,
        attributes: Dict,
        rules: List[Dict]
    ) -> Tuple[float, List[str], List[str]]:
        """Check if all mandatory fields are present and valid"""
        mandatory_rules = [r for r in rules if r.get('is_mandatory', False)]
        if not mandatory_rules:
            return 100.0, [], []

        present_count = 0
        issues = []
        suggestions = []

        for rule in mandatory_rules:
            attr_name = rule['attribute_name']
            if attr_name in attributes and attributes[attr_name]:
                value = str(attributes[attr_name]).strip()
                if not value:
                    issues.append(f"Mandatory field '{attr_name}' is empty")
                    suggestions.append(f"Provide a non-empty value for {attr_name}")
                    continue

                # Check length constraints
                min_len = rule.get('min_length')
                max_len = rule.get('max_length')
                if min_len and len(value) < min_len:
                    issues.append(f"'{attr_name}' too short (min: {min_len} chars)")
                    suggestions.append(f"Expand {attr_name} to at least {min_len} characters")
                    continue
                if max_len and len(value) > max_len:
                    issues.append(f"'{attr_name}' too long (max: {max_len} chars)")
                    suggestions.append(f"Shorten {attr_name} to {max_len} characters or less")
                    continue

                # Check regex pattern if provided
                regex = rule.get('validation_regex')
                if regex:
                    try:
                        if not re.match(regex, value):
                            issues.append(f"'{attr_name}' format invalid")
                            suggestions.append(f"Ensure {attr_name} matches required format")
                            continue
                    except re.error:
                        logger.warning(f"Invalid regex pattern for {attr_name}: {regex}")

                present_count += 1
            else:
                issues.append(f"Missing mandatory field: {attr_name}")
                desc = rule.get('description', '')
                if desc:
                    suggestions.append(f"Add {attr_name}: {desc}")
                else:
                    suggestions.append(f"Add required attribute: {attr_name}")

        score = (present_count / len(mandatory_rules)) * 100 if mandatory_rules else 100.0
        return score, issues, suggestions
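
    # Illustrative shape of a CategoryAttributeRule dict as consumed above (field names are
    # inferred from the .get() calls here; the real model may carry additional fields):
    #   {'attribute_name': 'color', 'is_mandatory': True, 'min_length': 3, 'max_length': 30,
    #    'validation_regex': None, 'valid_values': ['Black', 'White'],
    #    'description': 'Primary color of the product'}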

    def _check_standardization(
        self,
        attributes: Dict,
        rules: List[Dict]
    ) -> Tuple[float, List[str], List[str]]:
        """Check if attribute values match standardized valid values"""
        standardized_rules = [r for r in rules if r.get('valid_values')]
        if not standardized_rules:
            return 100.0, [], []

        correct_count = 0
        issues = []
        suggestions = []

        for rule in standardized_rules:
            attr_name = rule['attribute_name']
            valid_values = rule['valid_values']
            if not valid_values:
                continue
            if attr_name not in attributes or not attributes[attr_name]:
                continue

            actual_value = str(attributes[attr_name]).strip()
            if not actual_value:
                continue

            # Exact match
            if actual_value in valid_values:
                correct_count += 1
                continue

            # Case-insensitive match
            lower_valid = {v.lower(): v for v in valid_values}
            if actual_value.lower() in lower_valid:
                correct_count += 1
                correct_value = lower_valid[actual_value.lower()]
                if actual_value != correct_value:
                    issues.append(
                        f"{attr_name}: Case mismatch - '{actual_value}' should be '{correct_value}'"
                    )
                    suggestions.append(f"Correct capitalization of {attr_name} to: {correct_value}")
                continue

            # Fuzzy matching
            best_match = process.extractOne(actual_value, valid_values, scorer=fuzz.ratio)
            if best_match and best_match[1] >= 80:
                correct_count += 1
                if best_match[1] < 100:
                    issues.append(
                        f"{attr_name}: '{actual_value}' likely means '{best_match[0]}' "
                        f"(confidence: {best_match[1]}%)"
                    )
                    suggestions.append(f"Standardize {attr_name} to: {best_match[0]}")
            else:
                issues.append(
                    f"{attr_name}: '{actual_value}' not recognized. "
                    f"Valid: {', '.join(valid_values[:3])}"
                )
                suggestions.append(f"Change {attr_name} to one of: {', '.join(valid_values[:3])}")

        score = (correct_count / len(standardized_rules)) * 100 if standardized_rules else 100.0
        return score, issues, suggestions
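
    # Example of the fuzzy fallback above (illustrative): with valid_values ['Black', 'White'],
    # a value of 'Blck' scores roughly 89 with fuzz.ratio, so it is counted as correct but an
    # issue/suggestion pair recommending 'Black' is still emitted; anything under 80 is rejected
    # and the first few valid values are offered instead.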

    def _check_missing_values(
        self,
        attributes: Dict,
        rules: List[Dict]
    ) -> Tuple[float, List[str], List[str]]:
        """Check for placeholder values or empty attributes"""
        placeholder_patterns = [
            r'^n/?a$', r'^none$', r'^null$', r'^-+$', r'^\.+$',
            r'^tbd$', r'^to be determined$', r'^unknown$', r'^na$',
            r'^todo$', r'^pending$', r'^\?+$', r'^xxx+$', r'^placeholder$'
        ]
        total_attrs = len(rules)
        valid_count = 0
        issues = []
        suggestions = []

        for rule in rules:
            attr_name = rule['attribute_name']
            if attr_name not in attributes:
                continue

            value = str(attributes[attr_name]).strip()
            if not value:
                issues.append(f"'{attr_name}' is empty")
                suggestions.append(f"Provide a valid value for {attr_name}")
                continue

            value_lower = value.lower()
            is_placeholder = any(
                re.match(pattern, value_lower, re.IGNORECASE)
                for pattern in placeholder_patterns
            )
            if is_placeholder:
                issues.append(f"'{attr_name}' contains placeholder: '{value}'")
                suggestions.append(f"Replace placeholder in {attr_name} with actual data")
                continue

            # Check for suspiciously short values on mandatory fields
            if rule.get('is_mandatory') and len(value) < 2:
                issues.append(f"'{attr_name}' suspiciously short: '{value}'")
                suggestions.append(f"Provide more detailed {attr_name}")
                continue

            valid_count += 1

        score = (valid_count / total_attrs) * 100 if total_attrs > 0 else 100.0
        return score, issues, suggestions
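
    # Illustrative inputs for the check above: values such as 'N/A', 'none', 'TBD', '---' or
    # '???' all match the placeholder patterns (matching is case-insensitive), so they are
    # flagged with an issue/suggestion pair rather than counted as valid.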

    def _check_consistency(
        self,
        attributes: Dict,
        title: str,
        description: str
    ) -> Tuple[float, List[str]]:
        """Check if attribute values are mentioned in title/description"""
        issues = []
        consistency_count = 0
        total_checks = 0
        check_attrs = ['brand', 'color', 'size', 'material', 'model', 'weight', 'dimensions']
        combined_text = f"{title} {description}".lower()

        for attr in check_attrs:
            if attr not in attributes or not attributes[attr]:
                continue

            total_checks += 1
            attr_value = str(attributes[attr]).lower().strip()

            # Skip very short values (likely abbreviations or numbers)
            if len(attr_value) < 2:
                consistency_count += 1
                continue

            # Direct substring match
            if attr_value in combined_text:
                consistency_count += 1
                continue

            # Word boundary match (for multi-word attributes)
            words_in_attr = set(attr_value.split())
            words_in_text = set(combined_text.split())
            if words_in_attr.issubset(words_in_text):
                consistency_count += 1
                continue

            # Fuzzy matching as last resort
            text_words = combined_text.split()
            if text_words:
                match = process.extractOne(attr_value, text_words, scorer=fuzz.ratio)
                if match and match[1] >= 80:
                    consistency_count += 1
                    continue

            # If we get here, the attribute is not mentioned
            issues.append(
                f"'{attr.title()}': '{attributes[attr]}' not mentioned in title/description"
            )

        score = (consistency_count / total_checks) * 100 if total_checks > 0 else 100.0
        return score, issues

    # ========== UTILITY METHODS (unchanged) ==========

    def extract_attributes_from_text(self, text: str, category: str = '') -> Dict:
        """Extract attributes from unstructured text using NER and patterns"""
        extracted = {}

        # Try spaCy NER if available
        if self.nlp:
            try:
                doc = self.nlp(text)
                # Organizations as potential brands
                orgs = [ent.text for ent in doc.ents if ent.label_ == 'ORG']
                if orgs:
                    extracted['brand'] = orgs[0]
                # Quantities as potential sizes
                quantities = [ent.text for ent in doc.ents if ent.label_ == 'QUANTITY']
                if quantities:
                    extracted['size'] = quantities[0]
            except Exception as e:
                logger.warning(f"spaCy extraction failed: {e}")

        # Pattern-based extraction for colors
        color_patterns = [
            'black', 'white', 'red', 'blue', 'green', 'yellow', 'orange',
            'purple', 'pink', 'brown', 'gray', 'grey', 'silver', 'gold',
            'rose gold', 'space gray', 'navy', 'beige', 'tan'
        ]
        text_lower = text.lower()
        for color in color_patterns:
            if color in text_lower:
                extracted['color'] = color.title()
                break

        # Use AI for enhanced extraction if available
        if self.use_ai and self.ai_service and len(extracted) < 3:
            try:
                ai_extracted = self.ai_service.extract_attributes_with_ai(
                    text, '', category
                )
                # Merge AI results (don't override existing values)
                for key, value in ai_extracted.items():
                    if key not in extracted and value:
                        extracted[key] = value
            except Exception as e:
                logger.warning(f"AI extraction failed: {e}")

        return extracted

    def get_score_interpretation(self, score: float) -> Dict[str, str]:
        """Get human-readable interpretation of score"""
        if score >= 90:
            return {
                'grade': 'A',
                'status': 'Excellent',
                'color': 'green',
                'recommendation': 'Product listing is of high quality. Minor tweaks only.'
            }
        elif score >= 80:
            return {
                'grade': 'B',
                'status': 'Good',
                'color': 'lightgreen',
                'recommendation': 'Good quality. Address minor issues to reach excellence.'
            }
        elif score >= 70:
            return {
                'grade': 'C',
                'status': 'Fair',
                'color': 'yellow',
                'recommendation': 'Acceptable but needs improvement. Review suggestions.'
            }
        elif score >= 60:
            return {
                'grade': 'D',
                'status': 'Poor',
                'color': 'orange',
                'recommendation': 'Significant issues found. Requires immediate attention.'
            }
        else:
            return {
                'grade': 'F',
                'status': 'Critical',
                'color': 'red',
                'recommendation': 'Critical quality issues. Major revision needed.'
            }
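

# ---------------------------------------------------------------------------
# Illustrative usage sketch (assumption: not part of the original module; the
# product and rule dicts below are hypothetical demo data). The optional
# sub-scorers all fail soft, and AI/SEO are disabled here, so only the
# built-in attribute checks contribute to the score.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    scorer = AttributeQualityScorer(use_ai=False, use_seo=False)

    sample_product = {
        'sku': 'DEMO-001',
        'category': 'Electronics',
        'title': 'Acme Wireless Headphones, Black',
        'description': 'Acme over-ear wireless headphones in black with a 30-hour battery.',
        'attributes': {'brand': 'Acme', 'color': 'Blck', 'warranty': 'TBD'}
    }
    sample_rules = [
        {'attribute_name': 'brand', 'is_mandatory': True, 'min_length': 2},
        {'attribute_name': 'color', 'is_mandatory': True,
         'valid_values': ['Black', 'White', 'Blue']},
        {'attribute_name': 'warranty', 'is_mandatory': False},
    ]

    report = scorer.score_product(sample_product, sample_rules, generate_ai_suggestions=False)
    grade = scorer.get_score_interpretation(report['final_score'])
    print(f"Final score: {report['final_score']} (grade {grade['grade']})")
    for issue in report['issues']:
        print(f"- {issue}")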