# attribute_scorer.py (39 KB)
  1. # # attribute_scorer_integrated.py
  2. # """
  3. # Enhanced AttributeQualityScorer with ProductContentRule integration
  4. # """
  5. # import re
  6. # import time
  7. # import logging
  8. # from typing import Dict, List, Tuple
  9. # from rapidfuzz import fuzz, process
  10. # from collections import defaultdict
  11. # logger = logging.getLogger(__name__)
  12. # class AttributeQualityScorer:
  13. # """
  14. # Optimized Product Quality Scorer with safe handling of optional components:
  15. # - Mandatory Fields (16%)
  16. # - Standardization (12%)
  17. # - Missing Values (8%)
  18. # - Consistency (4%)
  19. # - SEO Discoverability (8%)
  20. # - Content Rules Compliance (12%)
  21. # - Title Quality (8%)
  22. # - Description Quality (12%)
  23. # - Image Quality (20%)
  24. # Optional components do not inflate final score; weights are rescaled proportionally.
  25. # """
  26. # def __init__(self, use_ai: bool = True, use_seo: bool = True):
  27. # # spaCy for NER
  28. # self.nlp = None
  29. # try:
  30. # import spacy
  31. # self.nlp = spacy.load("en_core_web_sm")
  32. # logger.info("spaCy model loaded successfully")
  33. # except Exception as e:
  34. # logger.warning(f"spaCy not loaded: {e}")
  35. # # AI Service
  36. # self.use_ai = use_ai
  37. # self.ai_service = None
  38. # if use_ai:
  39. # try:
  40. # from .gemini_service import GeminiAttributeService
  41. # self.ai_service = GeminiAttributeService()
  42. # except Exception as e:
  43. # logger.warning(f"AI service not available: {e}")
  44. # self.use_ai = False
  45. # # Image scorer
  46. # self.image_scorer = None
  47. # try:
  48. # from .image_scorer import ImageQualityScorer
  49. # self.image_scorer = ImageQualityScorer()
  50. # except Exception as e:
  51. # logger.warning(f"Image scorer not available: {e}")
  52. # # SEO scorer
  53. # self.use_seo = use_seo
  54. # self.seo_scorer = None
  55. # if use_seo:
  56. # try:
  57. # from .seo_scorer import SEODiscoverabilityScorer
  58. # self.seo_scorer = SEODiscoverabilityScorer()
  59. # except Exception as e:
  60. # logger.warning(f"SEO scorer not available: {e}")
  61. # self.use_seo = False
  62. # # Title/Description scorer
  63. # self.title_desc_scorer = None
  64. # try:
  65. # from .title_description_scorer import TitleDescriptionScorer
  66. # self.title_desc_scorer = TitleDescriptionScorer(use_ai=use_ai)
  67. # except Exception as e:
  68. # logger.warning(f"Title/Description scorer not available: {e}")
  69. # # Content Rules scorer
  70. # self.content_rules_scorer = None
  71. # try:
  72. # from .content_rules_scorer import ContentRulesScorer
  73. # self.content_rules_scorer = ContentRulesScorer()
  74. # except Exception as e:
  75. # logger.warning(f"Content Rules scorer not available: {e}")
  76. # # Base weights
  77. # self.weights = {
  78. # 'mandatory_fields': 0.16,
  79. # 'standardization': 0.12,
  80. # 'missing_values': 0.08,
  81. # 'consistency': 0.04,
  82. # 'seo_discoverability': 0.08,
  83. # 'content_rules_compliance': 0.12,
  84. # 'title_quality': 0.08,
  85. # 'description_quality': 0.12,
  86. # 'image_quality': 0.20
  87. # }
  88. # def score_product(
  89. # self,
  90. # product: Dict,
  91. # category_rules: List[Dict],
  92. # content_rules: List[Dict] = None,
  93. # generate_ai_suggestions: bool = True
  94. # ) -> Dict:
  95. # start_time = time.time()
  96. # attributes = product.get('attributes', {})
  97. # category = product.get('category', '')
  98. # scores = {k: 0 for k in self.weights.keys()}
  99. # issues, suggestions = [], []
  100. # # --- 1. Mandatory Fields ---
  101. # try:
  102. # score, i, s = self._check_mandatory_fields(attributes, category_rules)
  103. # scores['mandatory_fields'] = score
  104. # issues.extend(i)
  105. # suggestions.extend(s)
  106. # except Exception as e:
  107. # logger.error(f"[Mandatory Fields] {e}", exc_info=True)
  108. # # --- 2. Standardization ---
  109. # try:
  110. # score, i, s = self._check_standardization(attributes, category_rules)
  111. # scores['standardization'] = score
  112. # issues.extend(i)
  113. # suggestions.extend(s)
  114. # except Exception as e:
  115. # logger.error(f"[Standardization] {e}", exc_info=True)
  116. # # --- 3. Missing Values ---
  117. # try:
  118. # score, i, s = self._check_missing_values(attributes, category_rules)
  119. # scores['missing_values'] = score
  120. # issues.extend(i)
  121. # suggestions.extend(s)
  122. # except Exception as e:
  123. # logger.error(f"[Missing Values] {e}", exc_info=True)
  124. # # --- 4. Consistency ---
  125. # try:
  126. # score, i = self._check_consistency(attributes, product.get('title', ''), product.get('description', ''))
  127. # scores['consistency'] = score
  128. # issues.extend(i)
  129. # except Exception as e:
  130. # logger.error(f"[Consistency] {e}", exc_info=True)
  131. # # --- 5. SEO ---
  132. # seo_result = None
  133. # if self.use_seo and self.seo_scorer:
  134. # try:
  135. # seo_result = self.seo_scorer.score_seo(product, category_rules)
  136. # scores['seo_discoverability'] = seo_result['seo_score']
  137. # issues.extend(seo_result['issues'])
  138. # suggestions.extend(seo_result['suggestions'])
  139. # except Exception as e:
  140. # logger.error(f"[SEO] {e}", exc_info=True)
  141. # # --- 6. Content Rules ---
  142. # content_rules_result = None
  143. # if content_rules and self.content_rules_scorer:
  144. # try:
  145. # content_rules_result = self.content_rules_scorer.score_content_fields(product, content_rules)
  146. # scores['content_rules_compliance'] = content_rules_result['overall_content_score']
  147. # issues.extend(content_rules_result['issues'])
  148. # suggestions.extend(content_rules_result['suggestions'])
  149. # except Exception as e:
  150. # logger.error(f"[Content Rules] {e}", exc_info=True)
  151. # # --- 7. Title & Description ---
  152. # title_desc_result = None
  153. # if self.title_desc_scorer:
  154. # try:
  155. # title_desc_result = self.title_desc_scorer.score_title_and_description(product, category_rules)
  156. # scores['title_quality'] = title_desc_result['title_score']
  157. # scores['description_quality'] = title_desc_result['description_score']
  158. # issues.extend(title_desc_result['issues'])
  159. # suggestions.extend(title_desc_result['suggestions'])
  160. # except Exception as e:
  161. # logger.error(f"[Title/Desc] {e}", exc_info=True)
  162. # # --- 8. Image Quality (always present in breakdown) ---
  163. # image_result = None
  164. # if self.image_scorer:
  165. # images = product.get("images", [])
  166. # if images:
  167. # try:
  168. # image_result = self.image_scorer.score_images(images)
  169. # scores['image_quality'] = image_result.get("overall_image_score", 0)
  170. # except Exception as e:
  171. # logger.error(f"[Image] {e}", exc_info=True)
  172. # scores['image_quality'] = None
  173. # else:
  174. # scores['image_quality'] = None # explicitly mark as unavailable
  175. # else:
  176. # scores['image_quality'] = None
  177. # # --- 9. Weight Rescaling for applicable numeric components ---
  178. # numeric_scores = {k: v for k, v in scores.items() if isinstance(v, (int, float))}
  179. # applicable_weights = {k: self.weights[k] for k in numeric_scores}
  180. # total_weight = sum(applicable_weights.values())
  181. # final_score = sum(numeric_scores[k] * (applicable_weights[k]/total_weight) for k in numeric_scores) if numeric_scores else None
  182. # if image_result:
  183. # result['image_details'] = image_result
  184. # # AI Suggestions
  185. # if generate_ai_suggestions and self.use_ai and self.ai_service:
  186. # try:
  187. # ai_suggestions = self.ai_service.generate_comprehensive_suggestions(
  188. # product, issues, category_rules, scores
  189. # )
  190. # result['ai_suggestions'] = ai_suggestions
  191. # except Exception as e:
  192. # logger.error(f"[AI Suggestions] {e}", exc_info=True)
  193. # result['ai_suggestions'] = {'error': str(e)}
  194. # return result
  195. # # ========== ATTRIBUTE VALIDATION METHODS (unchanged) ==========
  196. # def _check_mandatory_fields(
  197. # self,
  198. # attributes: Dict,
  199. # rules: List[Dict]
  200. # ) -> Tuple[float, List[str], List[str]]:
  201. # """Check if all mandatory fields are present and valid"""
  202. # mandatory_rules = [r for r in rules if r.get('is_mandatory', False)]
  203. # if not mandatory_rules:
  204. # return 100.0, [], []
  205. # present_count = 0
  206. # issues = []
  207. # suggestions = []
  208. # for rule in mandatory_rules:
  209. # attr_name = rule['attribute_name']
  210. # if attr_name in attributes and attributes[attr_name]:
  211. # value = str(attributes[attr_name]).strip()
  212. # if not value:
  213. # issues.append(f"Mandatory field '{attr_name}' is empty")
  214. # suggestions.append(f"Provide a non-empty value for {attr_name}")
  215. # continue
  216. # # Check length constraints
  217. # min_len = rule.get('min_length')
  218. # max_len = rule.get('max_length')
  219. # if min_len and len(value) < min_len:
  220. # issues.append(f"'{attr_name}' too short (min: {min_len} chars)")
  221. # suggestions.append(f"Expand {attr_name} to at least {min_len} characters")
  222. # continue
  223. # if max_len and len(value) > max_len:
  224. # issues.append(f"'{attr_name}' too long (max: {max_len} chars)")
  225. # suggestions.append(f"Shorten {attr_name} to {max_len} characters or less")
  226. # continue
  227. # # Check regex pattern if provided
  228. # regex = rule.get('validation_regex')
  229. # if regex:
  230. # try:
  231. # if not re.match(regex, value):
  232. # issues.append(f"'{attr_name}' format invalid")
  233. # suggestions.append(f"Ensure {attr_name} matches required format")
  234. # continue
  235. # except re.error:
  236. # logger.warning(f"Invalid regex pattern for {attr_name}: {regex}")
  237. # present_count += 1
  238. # else:
  239. # issues.append(f"Missing mandatory field: {attr_name}")
  240. # desc = rule.get('description', '')
  241. # if desc:
  242. # suggestions.append(f"Add {attr_name}: {desc}")
  243. # else:
  244. # suggestions.append(f"Add required attribute: {attr_name}")
  245. # score = (present_count / len(mandatory_rules)) * 100 if mandatory_rules else 100.0
  246. # return score, issues, suggestions
  247. # def _check_standardization(
  248. # self,
  249. # attributes: Dict,
  250. # rules: List[Dict]
  251. # ) -> Tuple[float, List[str], List[str]]:
  252. # """Check if attribute values match standardized valid values"""
  253. # standardized_rules = [r for r in rules if r.get('valid_values')]
  254. # if not standardized_rules:
  255. # return 100.0, [], []
  256. # correct_count = 0
  257. # issues = []
  258. # suggestions = []
  259. # for rule in standardized_rules:
  260. # attr_name = rule['attribute_name']
  261. # valid_values = rule['valid_values']
  262. # if not valid_values:
  263. # continue
  264. # if attr_name not in attributes or not attributes[attr_name]:
  265. # continue
  266. # actual_value = str(attributes[attr_name]).strip()
  267. # if not actual_value:
  268. # continue
  269. # # Exact match
  270. # if actual_value in valid_values:
  271. # correct_count += 1
  272. # continue
  273. # # Case-insensitive match
  274. # lower_valid = {v.lower(): v for v in valid_values}
  275. # if actual_value.lower() in lower_valid:
  276. # correct_count += 1
  277. # correct_value = lower_valid[actual_value.lower()]
  278. # if actual_value != correct_value:
  279. # issues.append(
  280. # f"{attr_name}: Case mismatch - '{actual_value}' should be '{correct_value}'"
  281. # )
  282. # suggestions.append(f"Correct capitalization of {attr_name} to: {correct_value}")
  283. # continue
  284. # # Fuzzy matching
  285. # best_match = process.extractOne(actual_value, valid_values, scorer=fuzz.ratio)
  286. # if best_match and best_match[1] >= 80:
  287. # correct_count += 1
  288. # if best_match[1] < 100:
  289. # issues.append(
  290. # f"{attr_name}: '{actual_value}' likely means '{best_match[0]}' "
  291. # f"(confidence: {best_match[1]}%)"
  292. # )
  293. # suggestions.append(f"Standardize {attr_name} to: {best_match[0]}")
  294. # else:
  295. # issues.append(
  296. # f"{attr_name}: '{actual_value}' not recognized. "
  297. # f"Valid: {', '.join(valid_values[:3])}"
  298. # )
  299. # suggestions.append(f"Change {attr_name} to one of: {', '.join(valid_values[:3])}")
  300. # score = (correct_count / len(standardized_rules)) * 100 if standardized_rules else 100.0
  301. # return score, issues, suggestions
  302. # def _check_missing_values(
  303. # self,
  304. # attributes: Dict,
  305. # rules: List[Dict]
  306. # ) -> Tuple[float, List[str], List[str]]:
  307. # """Check for placeholder values or empty attributes"""
  308. # placeholder_patterns = [
  309. # r'^n/?a$', r'^none$', r'^null$', r'^-+$', r'^\.+$',
  310. # r'^tbd$', r'^to be determined$', r'^unknown$', r'^na$',
  311. # r'^todo$', r'^pending$', r'^\?+$', r'^xxx+$', r'^placeholder$'
  312. # ]
  313. # total_attrs = len(rules)
  314. # valid_count = 0
  315. # issues = []
  316. # suggestions = []
  317. # for rule in rules:
  318. # attr_name = rule['attribute_name']
  319. # if attr_name not in attributes:
  320. # continue
  321. # value = str(attributes[attr_name]).strip()
  322. # if not value:
  323. # issues.append(f"'{attr_name}' is empty")
  324. # suggestions.append(f"Provide a valid value for {attr_name}")
  325. # continue
  326. # value_lower = value.lower()
  327. # is_placeholder = any(
  328. # re.match(pattern, value_lower, re.IGNORECASE)
  329. # for pattern in placeholder_patterns
  330. # )
  331. # if is_placeholder:
  332. # issues.append(f"'{attr_name}' contains placeholder: '{value}'")
  333. # suggestions.append(f"Replace placeholder in {attr_name} with actual data")
  334. # continue
  335. # # Check for suspiciously short values on mandatory fields
  336. # if rule.get('is_mandatory') and len(value) < 2:
  337. # issues.append(f"'{attr_name}' suspiciously short: '{value}'")
  338. # suggestions.append(f"Provide more detailed {attr_name}")
  339. # continue
  340. # valid_count += 1
  341. # score = (valid_count / total_attrs) * 100 if total_attrs > 0 else 100.0
  342. # return score, issues, suggestions
  343. # def _check_consistency(
  344. # self,
  345. # attributes: Dict,
  346. # title: str,
  347. # description: str
  348. # ) -> Tuple[float, List[str]]:
  349. # """Check if attribute values are mentioned in title/description"""
  350. # issues = []
  351. # consistency_count = 0
  352. # total_checks = 0
  353. # check_attrs = ['brand', 'color', 'size', 'material', 'model', 'weight', 'dimensions']
  354. # combined_text = f"{title} {description}".lower()
  355. # for attr in check_attrs:
  356. # if attr not in attributes or not attributes[attr]:
  357. # continue
  358. # total_checks += 1
  359. # attr_value = str(attributes[attr]).lower().strip()
  360. # if len(attr_value) < 2:
  361. # consistency_count += 1
  362. # continue
  363. # if attr_value in combined_text:
  364. # consistency_count += 1
  365. # continue
  366. # words_in_attr = set(attr_value.split())
  367. # words_in_text = set(combined_text.split())
  368. # if words_in_attr.issubset(words_in_text):
  369. # consistency_count += 1
  370. # continue
  371. # text_words = combined_text.split()
  372. # if text_words:
  373. # match = process.extractOne(attr_value, text_words, scorer=fuzz.ratio)
  374. # if match and match[1] >= 80:
  375. # consistency_count += 1
  376. # continue
  377. # issues.append(
  378. # f"'{attr.title()}': '{attributes[attr]}' not mentioned in title/description"
  379. # )
  380. # score = (consistency_count / total_checks) * 100 if total_checks > 0 else 100.0
  381. # return score, issues
  382. # # ========== UTILITY METHODS (unchanged) ==========
  383. # def extract_attributes_from_text(self, text: str, category: str = '') -> Dict:
  384. # """Extract attributes from unstructured text using NER and patterns"""
  385. # extracted = {}
  386. # if self.nlp:
  387. # try:
  388. # doc = self.nlp(text)
  389. # orgs = [ent.text for ent in doc.ents if ent.label_ == 'ORG']
  390. # if orgs:
  391. # extracted['brand'] = orgs[0]
  392. # quantities = [ent.text for ent in doc.ents if ent.label_ == 'QUANTITY']
  393. # if quantities:
  394. # extracted['size'] = quantities[0]
  395. # except Exception as e:
  396. # logger.warning(f"spaCy extraction failed: {e}")
  397. # # Pattern-based extraction for colors
  398. # color_patterns = [
  399. # 'black', 'white', 'red', 'blue', 'green', 'yellow', 'orange',
  400. # 'purple', 'pink', 'brown', 'gray', 'grey', 'silver', 'gold',
  401. # 'rose gold', 'space gray', 'navy', 'beige', 'tan'
  402. # ]
  403. # text_lower = text.lower()
  404. # for color in color_patterns:
  405. # if color in text_lower:
  406. # extracted['color'] = color.title()
  407. # break
  408. # # Use AI for enhanced extraction if available
  409. # if self.use_ai and self.ai_service and len(extracted) < 3:
  410. # try:
  411. # ai_extracted = self.ai_service.extract_attributes_with_ai(
  412. # text, '', category
  413. # )
  414. # for key, value in ai_extracted.items():
  415. # if key not in extracted and value:
  416. # extracted[key] = value
  417. # except Exception as e:
  418. # logger.warning(f"AI extraction failed: {e}")
  419. # return extracted
  420. # def get_score_interpretation(self, score: float) -> Dict[str, str]:
  421. # """Get human-readable interpretation of score"""
  422. # if score >= 90:
  423. # return {
  424. # 'grade': 'A',
  425. # 'status': 'Excellent',
  426. # 'color': 'green',
  427. # 'recommendation': 'Product listing is of high quality. Minor tweaks only.'
  428. # }
  429. # elif score >= 80:
  430. # return {
  431. # 'grade': 'B',
  432. # 'status': 'Good',
  433. # 'color': 'lightgreen',
  434. # 'recommendation': 'Good quality. Address minor issues to reach excellence.'
  435. # }
  436. # elif score >= 70:
  437. # return {
  438. # 'grade': 'C',
  439. # 'status': 'Fair',
  440. # 'color': 'yellow',
  441. # 'recommendation': 'Acceptable but needs improvement. Review suggestions.'
  442. # }
  443. # elif score >= 60:
  444. # return {
  445. # 'grade': 'D',
  446. # 'status': 'Poor',
  447. # 'color': 'orange',
  448. # 'recommendation': 'Significant issues found. Requires immediate attention.'
  449. # }
  450. # else:
  451. # return {
  452. # 'grade': 'F',
  453. # 'status': 'Critical',
  454. # 'color': 'red',
  455. # 'recommendation': 'Critical quality issues. Major revision needed.'
  456. # }
  457. # attribute_scorer_integrated.py
  458. """
  459. Enhanced AttributeQualityScorer with ProductContentRule integration
  460. Consistent breakdown and component status for unavailable scores
  461. """
  462. import re
  463. import time
  464. import logging
  465. from typing import Dict, List, Tuple
  466. from rapidfuzz import fuzz, process
  467. logger = logging.getLogger(__name__)
  468. class AttributeQualityScorer:
  469. """
  470. Complete Product Quality Scorer with ALL components INCLUDING ProductContentRule validation:
  471. - Mandatory Fields (16%)
  472. - Standardization (12%)
  473. - Missing Values (8%)
  474. - Consistency (4%)
  475. - SEO Discoverability (8%)
  476. - Content Rules Compliance (12%) ← NEW
  477. - Title Quality (8%)
  478. - Description Quality (12%)
  479. - Image Quality (20%)
  480. """
  481. def __init__(self, use_ai: bool = True, use_seo: bool = True):
  482. # Load spaCy model (optional, for advanced NER)
  483. self.nlp = None
  484. try:
  485. import spacy
  486. self.nlp = spacy.load("en_core_web_sm")
  487. logger.info("spaCy model loaded successfully")
  488. except Exception as e:
  489. logger.warning(f"spaCy model not loaded: {e}. Will use fallback methods.")
  490. # Initialize AI service
  491. self.use_ai = use_ai
  492. self.ai_service = None
  493. if use_ai:
  494. try:
  495. from .gemini_service import GeminiAttributeService
  496. self.ai_service = GeminiAttributeService()
  497. logger.info("Gemini AI service initialized")
  498. except Exception as e:
  499. logger.warning(f"Gemini service not available: {e}")
  500. self.use_ai = False
  501. # Initialize Image Scorer
  502. self.image_scorer = None
  503. try:
  504. from .image_scorer import ImageQualityScorer
  505. self.image_scorer = ImageQualityScorer()
  506. logger.info("Image scorer initialized")
  507. except Exception as e:
  508. logger.warning(f"Image scorer not available: {e}")
  509. # Initialize SEO scorer
  510. self.use_seo = use_seo
  511. self.seo_scorer = None
  512. if use_seo:
  513. try:
  514. from .seo_scorer import SEODiscoverabilityScorer
  515. self.seo_scorer = SEODiscoverabilityScorer()
  516. logger.info("SEO scorer initialized")
  517. except Exception as e:
  518. logger.warning(f"SEO scorer not available: {e}")
  519. self.use_seo = False
  520. # Initialize Title/Description scorer
  521. self.title_desc_scorer = None
  522. try:
  523. from .title_description_scorer import TitleDescriptionScorer
  524. self.title_desc_scorer = TitleDescriptionScorer(use_ai=use_ai)
  525. logger.info("Title/Description scorer initialized")
  526. except Exception as e:
  527. logger.warning(f"Title/Description scorer not available: {e}")
  528. # Initialize Content Rules scorer ← NEW
  529. self.content_rules_scorer = None
  530. try:
  531. from .content_rules_scorer import ContentRulesScorer
  532. self.content_rules_scorer = ContentRulesScorer()
  533. logger.info("Content Rules scorer initialized")
  534. except Exception as e:
  535. logger.warning(f"Content Rules scorer not available: {e}")
  536. # UPDATED WEIGHTS (Total = 100%)
  537. self.weights = {
  538. 'mandatory_fields': 0.16,
  539. 'standardization': 0.12,
  540. 'missing_values': 0.08,
  541. 'consistency': 0.04,
  542. 'seo_discoverability': 0.08,
  543. 'content_rules_compliance': 0.12,
  544. 'title_quality': 0.08,
  545. 'description_quality': 0.12,
  546. 'image_quality': 0.20
  547. }
  548. def score_product(
  549. self,
  550. product: Dict,
  551. category_rules: List[Dict],
  552. content_rules: List[Dict] = None,
  553. generate_ai_suggestions: bool = True
  554. ) -> Dict:
  555. """
  556. Complete product scoring with all components including ProductContentRule validation
  557. and consistent breakdown
  558. """
  559. start_time = time.time()
  560. attributes = product.get('attributes', {})
  561. category = product.get('category', '')
  562. # Initialize scores and status
  563. scores = {k: None for k in self.weights}
  564. component_status = {k: "Not evaluated yet" for k in self.weights}
  565. issues = []
  566. suggestions = []
  567. # --- 1. Mandatory Fields ---
  568. try:
  569. score, comp_issues, comp_suggestions = self._check_mandatory_fields(attributes, category_rules)
  570. scores['mandatory_fields'] = score
  571. issues.extend(comp_issues)
  572. suggestions.extend(comp_suggestions)
  573. component_status['mandatory_fields'] = "Scored successfully"
  574. except Exception as e:
  575. logger.error(f"[Mandatory Fields] {e}", exc_info=True)
  576. component_status['mandatory_fields'] = f"Error: {str(e)}"
  577. # --- 2. Standardization ---
  578. try:
  579. score, comp_issues, comp_suggestions = self._check_standardization(attributes, category_rules)
  580. scores['standardization'] = score
  581. issues.extend(comp_issues)
  582. suggestions.extend(comp_suggestions)
  583. component_status['standardization'] = "Scored successfully"
  584. except Exception as e:
  585. logger.error(f"[Standardization] {e}", exc_info=True)
  586. component_status['standardization'] = f"Error: {str(e)}"
  587. # --- 3. Missing Values ---
  588. try:
  589. score, comp_issues, comp_suggestions = self._check_missing_values(attributes, category_rules)
  590. scores['missing_values'] = score
  591. issues.extend(comp_issues)
  592. suggestions.extend(comp_suggestions)
  593. component_status['missing_values'] = "Scored successfully"
  594. except Exception as e:
  595. logger.error(f"[Missing Values] {e}", exc_info=True)
  596. component_status['missing_values'] = f"Error: {str(e)}"
  597. # --- 4. Consistency ---
  598. try:
  599. score, comp_issues = self._check_consistency(attributes, product.get('title', ''), product.get('description', ''))
  600. scores['consistency'] = score
  601. issues.extend(comp_issues)
  602. component_status['consistency'] = "Scored successfully"
  603. except Exception as e:
  604. logger.error(f"[Consistency] {e}", exc_info=True)
  605. component_status['consistency'] = f"Error: {str(e)}"
  606. # --- 5. SEO Discoverability ---
  607. if self.use_seo and self.seo_scorer:
  608. try:
  609. seo_result = self.seo_scorer.score_seo(product, category_rules)
  610. scores['seo_discoverability'] = seo_result.get('seo_score', None)
  611. issues.extend(seo_result.get('issues', []))
  612. suggestions.extend(seo_result.get('suggestions', []))
  613. component_status['seo_discoverability'] = "Scored successfully"
  614. except Exception as e:
  615. logger.error(f"[SEO] {e}", exc_info=True)
  616. component_status['seo_discoverability'] = f"Error: {str(e)}"
  617. else:
  618. component_status['seo_discoverability'] = "SEO scorer not available"
  619. # --- 6. Content Rules Compliance ---
  620. if content_rules and self.content_rules_scorer:
  621. try:
  622. content_result = self.content_rules_scorer.score_content_fields(product, content_rules)
  623. scores['content_rules_compliance'] = content_result.get('overall_content_score', None)
  624. issues.extend(content_result.get('issues', []))
  625. suggestions.extend(content_result.get('suggestions', []))
  626. component_status['content_rules_compliance'] = "Scored successfully"
  627. except Exception as e:
  628. logger.error(f"[Content Rules] {e}", exc_info=True)
  629. component_status['content_rules_compliance'] = f"Error: {str(e)}"
  630. else:
  631. scores['content_rules_compliance'] = None
  632. component_status['content_rules_compliance'] = "No content rules provided"
  633. # --- 7. Title & Description ---
  634. if self.title_desc_scorer:
  635. try:
  636. td_result = self.title_desc_scorer.score_title_and_description(product, category_rules)
  637. scores['title_quality'] = td_result.get('title_score', None)
  638. scores['description_quality'] = td_result.get('description_score', None)
  639. issues.extend(td_result.get('issues', []))
  640. suggestions.extend(td_result.get('suggestions', []))
  641. component_status['title_quality'] = "Scored successfully"
  642. component_status['description_quality'] = "Scored successfully"
  643. except Exception as e:
  644. logger.error(f"[Title/Description] {e}", exc_info=True)
  645. component_status['title_quality'] = f"Error: {str(e)}"
  646. component_status['description_quality'] = f"Error: {str(e)}"
  647. else:
  648. component_status['title_quality'] = "Title/Description scorer not available"
  649. component_status['description_quality'] = "Title/Description scorer not available"
  650. # --- 8. Image Quality ---
  651. if self.image_scorer:
  652. try:
  653. images = product.get('images', [])
  654. if images:
  655. img_result = self.image_scorer.score_images(images)
  656. scores['image_quality'] = img_result.get("overall_image_score", None)
  657. component_status['image_quality'] = "Scored successfully"
  658. else:
  659. scores['image_quality'] = None
  660. component_status['image_quality'] = "No images provided"
  661. except Exception as e:
  662. logger.error(f"[Image] {e}", exc_info=True)
  663. scores['image_quality'] = None
  664. component_status['image_quality'] = f"Error: {str(e)}"
  665. else:
  666. scores['image_quality'] = None
  667. component_status['image_quality'] = "Image scorer not available"
  668. # --- Final Score Calculation: use only numeric scores ---
  669. numeric_scores = {k: v for k, v in scores.items() if isinstance(v, (int, float))}
  670. applicable_weights = {k: self.weights[k] for k in numeric_scores}
  671. total_weight = sum(applicable_weights.values())
  672. if numeric_scores and total_weight > 0:
  673. final_score = sum(numeric_scores[k] * (applicable_weights[k] / total_weight) for k in numeric_scores)
  674. else:
  675. final_score = None
  676. result = {
  677. 'final_score': round(final_score, 2) if final_score is not None else None,
  678. 'max_score': 100.0,
  679. 'breakdown': scores,
  680. 'component_status': component_status,
  681. 'issues': issues,
  682. 'suggestions': suggestions,
  683. 'weights': self.weights,
  684. 'processing_time': round(time.time() - start_time, 3)
  685. }
  686. # AI Suggestions
  687. if generate_ai_suggestions and self.use_ai and self.ai_service:
  688. try:
  689. ai_suggestions = self.ai_service.generate_comprehensive_suggestions(product, issues, category_rules, scores)
  690. result['ai_suggestions'] = ai_suggestions
  691. except Exception as e:
  692. logger.error(f"[AI Suggestions] {e}", exc_info=True)
  693. result['ai_suggestions'] = {'error': str(e)}
  694. return result
  695. # ================= ATTRIBUTE CHECK METHODS (unchanged) =================
  696. def _check_mandatory_fields(self, attributes: Dict, rules: List[Dict]) -> Tuple[float, List[str], List[str]]:
  697. mandatory_rules = [r for r in rules if r.get('is_mandatory', False)]
  698. if not mandatory_rules:
  699. return 100.0, [], []
  700. present_count, issues, suggestions = 0, [], []
  701. for rule in mandatory_rules:
  702. attr_name = rule['attribute_name']
  703. if attr_name in attributes and attributes[attr_name]:
  704. value = str(attributes[attr_name]).strip()
  705. if not value:
  706. issues.append(f"Mandatory field '{attr_name}' is empty")
  707. suggestions.append(f"Provide a non-empty value for {attr_name}")
  708. continue
  709. min_len = rule.get('min_length')
  710. max_len = rule.get('max_length')
  711. if min_len and len(value) < min_len:
  712. issues.append(f"'{attr_name}' too short (min: {min_len} chars)")
  713. suggestions.append(f"Expand {attr_name} to at least {min_len} characters")
  714. continue
  715. if max_len and len(value) > max_len:
  716. issues.append(f"'{attr_name}' too long (max: {max_len} chars)")
  717. suggestions.append(f"Shorten {attr_name} to {max_len} characters or less")
  718. continue
  719. regex = rule.get('validation_regex')
  720. if regex:
  721. try:
  722. if not re.match(regex, value):
  723. issues.append(f"'{attr_name}' format invalid")
  724. suggestions.append(f"Ensure {attr_name} matches required format")
  725. continue
  726. except re.error:
  727. logger.warning(f"Invalid regex pattern for {attr_name}: {regex}")
  728. present_count += 1
  729. else:
  730. issues.append(f"Missing mandatory field: {attr_name}")
  731. desc = rule.get('description', '')
  732. suggestions.append(f"Add {attr_name}: {desc}" if desc else f"Add required attribute: {attr_name}")
  733. score = (present_count / len(mandatory_rules)) * 100 if mandatory_rules else 100.0
  734. return score, issues, suggestions
  735. def _check_standardization(self, attributes: Dict, rules: List[Dict]) -> Tuple[float, List[str], List[str]]:
  736. standardized_rules = [r for r in rules if r.get('valid_values')]
  737. if not standardized_rules:
  738. return 100.0, [], []
  739. correct_count, issues, suggestions = 0, [], []
  740. for rule in standardized_rules:
  741. attr_name = rule['attribute_name']
  742. valid_values = rule['valid_values']
  743. if not valid_values or attr_name not in attributes or not attributes[attr_name]:
  744. continue
  745. actual_value = str(attributes[attr_name]).strip()
  746. if actual_value in valid_values:
  747. correct_count += 1
  748. continue
  749. lower_valid = {v.lower(): v for v in valid_values}
  750. if actual_value.lower() in lower_valid:
  751. correct_count += 1
  752. correct_value = lower_valid[actual_value.lower()]
  753. if actual_value != correct_value:
  754. issues.append(f"{attr_name}: Case mismatch - '{actual_value}' should be '{correct_value}'")
  755. suggestions.append(f"Correct capitalization of {attr_name} to: {correct_value}")
  756. continue
  757. best_match = process.extractOne(actual_value, valid_values, scorer=fuzz.ratio)
  758. if best_match and best_match[1] >= 80:
  759. correct_count += 1
  760. if best_match[1] < 100:
  761. issues.append(f"{attr_name}: '{actual_value}' likely means '{best_match[0]}' (confidence: {best_match[1]}%)")
  762. suggestions.append(f"Standardize {attr_name} to: {best_match[0]}")
  763. else:
  764. issues.append(f"{attr_name}: '{actual_value}' not recognized. Valid: {', '.join(valid_values[:3])}")
  765. suggestions.append(f"Change {attr_name} to one of: {', '.join(valid_values[:3])}")
  766. score = (correct_count / len(standardized_rules)) * 100 if standardized_rules else 100.0
  767. return score, issues, suggestions
  768. def _check_missing_values(self, attributes: Dict, rules: List[Dict]) -> Tuple[float, List[str], List[str]]:
  769. placeholder_patterns = [r'^n/?a$', r'^none$', r'^null$', r'^-+$', r'^\.+$', r'^tbd$', r'^to be determined$', r'^unknown$', r'^na$', r'^todo$', r'^pending$', r'^\?+$', r'^xxx+$', r'^placeholder$']
  770. total_attrs, valid_count, issues, suggestions = len(rules), 0, [], []
  771. for rule in rules:
  772. attr_name = rule['attribute_name']
  773. if attr_name not in attributes:
  774. continue
  775. value = str(attributes[attr_name]).strip()
  776. if not value:
  777. issues.append(f"'{attr_name}' is empty")
  778. suggestions.append(f"Provide a valid value for {attr_name}")
  779. continue
  780. value_lower = value.lower()
  781. is_placeholder = any(re.match(pattern, value_lower, re.IGNORECASE) for pattern in placeholder_patterns)
  782. if is_placeholder:
  783. issues.append(f"'{attr_name}' contains placeholder: '{value}'")
  784. suggestions.append(f"Replace placeholder in {attr_name} with actual data")
  785. continue
  786. if rule.get('is_mandatory') and len(value) < 2:
  787. issues.append(f"'{attr_name}' suspiciously short: '{value}'")
  788. suggestions.append(f"Provide more detailed {attr_name}")
  789. continue
  790. valid_count += 1
  791. score = (valid_count / total_attrs) * 100 if total_attrs > 0 else 100.0
  792. return score, issues, suggestions
  793. def _check_consistency(self, attributes: Dict, title: str, description: str) -> Tuple[float, List[str]]:
  794. issues, consistency_count, total_checks = [], 0, 0
  795. check_attrs = ['brand', 'color', 'size', 'material', 'model', 'weight', 'dimensions']
  796. combined_text = f"{title} {description}".lower()
  797. for attr in check_attrs:
  798. if attr not in attributes or not attributes[attr]:
  799. continue
  800. total_checks += 1
  801. attr_value = str(attributes[attr]).lower().strip()
  802. if len(attr_value) < 2 or attr_value in combined_text:
  803. consistency_count += 1
  804. continue
  805. words_in_attr = set(attr_value.split())
  806. words_in_text = set(combined_text.split())
  807. if words_in_attr.issubset(words_in_text):
  808. consistency_count += 1
  809. continue
  810. text_words = combined_text.split()
  811. if text_words:
  812. match = process.extractOne(attr_value, text_words, scorer=fuzz.ratio)
  813. if match and match[1] >= 80:
  814. consistency_count += 1
  815. continue
  816. issues.append(f"'{attr.title()}': '{attributes[attr]}' not mentioned in title/description")
  817. score = (consistency_count / total_checks) * 100 if total_checks > 0 else 100.0
  818. return score, issues