  1. # seo_scorer.py
  2. import re
  3. import logging
  4. from typing import Dict, List, Tuple, Set
  5. from collections import Counter
  6. import numpy as np
  7. logger = logging.getLogger(__name__)
  8. class SEODiscoverabilityScorer:
  9. """
  10. SEO & Discoverability scoring (15% weight)
  11. Checks: Keyword coverage, semantic richness, backend keywords
  12. """
  13. def __init__(self):
  14. self.keybert_model = None
  15. self.sentence_model = None
  16. self._initialize_models()
  17. # SEO scoring weights
  18. self.weights = {
  19. 'keyword_coverage': 0.35, # Are key attributes in title/description?
  20. 'semantic_richness': 0.30, # Descriptive quality & vocabulary diversity
  21. 'backend_keywords': 0.20, # Presence of searchable backend terms
  22. 'title_optimization': 0.15 # Title length, structure, readability
  23. }
  24. # Category-specific important keywords
  25. self.category_keywords = {
  26. 'Electronics': ['brand', 'model', 'warranty', 'condition', 'specs', 'features', 'technology'],
  27. 'Clothing': ['brand', 'size', 'color', 'material', 'fit', 'style', 'occasion', 'care'],
  28. 'Home & Garden': ['material', 'dimensions', 'color', 'style', 'brand', 'indoor', 'outdoor'],
  29. 'Sports': ['brand', 'size', 'sport', 'material', 'performance', 'level', 'gender']
  30. }
  31. # Common search terms users look for
  32. self.high_value_terms = {
  33. 'quality_indicators': ['premium', 'high-quality', 'durable', 'professional', 'authentic', 'genuine'],
  34. 'value_indicators': ['affordable', 'budget', 'value', 'economical', 'best', 'top-rated'],
  35. 'feature_terms': ['lightweight', 'waterproof', 'wireless', 'adjustable', 'portable', 'compact'],
  36. 'condition_terms': ['new', 'refurbished', 'used', 'like-new', 'open-box']
  37. }
  38. def _initialize_models(self):
  39. """Initialize NLP models with fallback handling"""
  40. try:
  41. from keybert import KeyBERT
  42. self.keybert_model = KeyBERT()
  43. logger.info("KeyBERT model loaded successfully")
  44. except Exception as e:
  45. logger.warning(f"KeyBERT not available: {e}. Using fallback keyword extraction.")
  46. self.keybert_model = None
  47. try:
  48. from sentence_transformers import SentenceTransformer
  49. self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
  50. logger.info("Sentence transformer model loaded successfully")
  51. except Exception as e:
  52. logger.warning(f"Sentence transformer not available: {e}. Using fallback semantic analysis.")
  53. self.sentence_model = None
  54. def score_seo(self, product: Dict, category_rules: List[Dict]) -> Dict:
  55. """
  56. Main SEO scoring function
  57. Returns: score breakdown, issues, and suggestions
  58. """
  59. try:
  60. title = product.get('title', '')
  61. description = product.get('description', '')
  62. category = product.get('category', '')
  63. attributes = product.get('attributes', {})
  64. scores = {}
  65. issues = []
  66. suggestions = []
  67. # 1. Keyword Coverage (35%)
  68. kw_score, kw_issues, kw_suggestions = self._check_keyword_coverage(
  69. title, description, attributes, category, category_rules
  70. )
  71. scores['keyword_coverage'] = kw_score
  72. issues.extend(kw_issues)
  73. suggestions.extend(kw_suggestions)
  74. # 2. Semantic Richness (30%)
  75. semantic_score, semantic_issues, semantic_suggestions = self._check_semantic_richness(
  76. title, description
  77. )
  78. scores['semantic_richness'] = semantic_score
  79. issues.extend(semantic_issues)
  80. suggestions.extend(semantic_suggestions)
  81. # 3. Backend Keywords (20%)
  82. backend_score, backend_issues, backend_suggestions = self._check_backend_keywords(
  83. title, description, attributes, category
  84. )
  85. scores['backend_keywords'] = backend_score
  86. issues.extend(backend_issues)
  87. suggestions.extend(backend_suggestions)
  88. # 4. Title Optimization (15%)
  89. title_score, title_issues, title_suggestions = self._check_title_optimization(
  90. title, attributes
  91. )
  92. scores['title_optimization'] = title_score
  93. issues.extend(title_issues)
  94. suggestions.extend(title_suggestions)
  95. # Calculate final SEO score
  96. final_score = sum(scores[key] * self.weights[key] for key in scores)
  97. return {
  98. 'seo_score': round(final_score, 2),
  99. 'breakdown': scores,
  100. 'issues': issues,
  101. 'suggestions': suggestions,
  102. 'extracted_keywords': self._extract_keywords(title, description),
  103. 'missing_high_value_terms': self._find_missing_high_value_terms(title, description, category)
  104. }
  105. except Exception as e:
  106. logger.error(f"SEO scoring error: {e}", exc_info=True)
  107. return {
  108. 'seo_score': 0.0,
  109. 'breakdown': {},
  110. 'issues': [f"SEO scoring failed: {str(e)}"],
  111. 'suggestions': []
  112. }
  113. def _check_keyword_coverage(
  114. self,
  115. title: str,
  116. description: str,
  117. attributes: Dict,
  118. category: str,
  119. rules: List[Dict]
  120. ) -> Tuple[float, List[str], List[str]]:
  121. """Check if key product attributes are mentioned in title/description"""
  122. issues = []
  123. suggestions = []
  124. combined_text = f"{title} {description}".lower()
  125. mandatory_attrs = [r['attribute_name'] for r in rules if r.get('is_mandatory')]
  126. covered_count = 0
  127. total_mandatory = len(mandatory_attrs)
  128. if total_mandatory == 0:
  129. return 100.0, [], []
  130. for attr_name in mandatory_attrs:
  131. attr_value = attributes.get(attr_name, '')
  132. if not attr_value:
  133. issues.append(f"SEO: Mandatory attribute '{attr_name}' is missing entirely")
  134. suggestions.append(f"Add {attr_name} to improve discoverability")
  135. continue
  136. attr_value_str = str(attr_value).lower()
  137. # Check if attribute value appears in title or description
  138. if attr_value_str in combined_text:
  139. covered_count += 1
  140. elif attr_name.lower() in combined_text:
  141. # Attribute name mentioned but not value
  142. covered_count += 0.5
  143. issues.append(f"SEO: '{attr_name}' mentioned but value '{attr_value}' not clearly stated")
  144. suggestions.append(f"Include specific {attr_name} '{attr_value}' in title or description")
  145. else:
  146. issues.append(f"SEO: Key attribute '{attr_name}: {attr_value}' not mentioned in title/description")
  147. suggestions.append(f"Add '{attr_name}: {attr_value}' to title or first line of description")
  148. score = (covered_count / total_mandatory) * 100 if total_mandatory > 0 else 100.0
  149. return score, issues, suggestions
  150. def _check_semantic_richness(
  151. self,
  152. title: str,
  153. description: str
  154. ) -> Tuple[float, List[str], List[str]]:
  155. """Evaluate descriptive quality and vocabulary diversity"""
  156. issues = []
  157. suggestions = []
  158. score_components = []
  159. # 1. Description length check
  160. desc_length = len(description.split())
  161. if desc_length < 20:
  162. issues.append(f"SEO: Description too short ({desc_length} words, recommended 50+)")
  163. suggestions.append("Expand description to 50-150 words for better SEO")
  164. length_score = (desc_length / 20) * 100
  165. elif desc_length > 300:
  166. issues.append(f"SEO: Description very long ({desc_length} words, may hurt readability)")
  167. suggestions.append("Consider condensing to 50-200 words for optimal engagement")
  168. length_score = 80.0
  169. else:
  170. length_score = 100.0
  171. score_components.append(length_score)
  172. # 2. Vocabulary diversity (unique words ratio)
  173. words = re.findall(r'\b\w+\b', description.lower())
  174. if words:
  175. unique_ratio = len(set(words)) / len(words)
  176. if unique_ratio < 0.5:
  177. issues.append("SEO: Low vocabulary diversity (repetitive text)")
  178. suggestions.append("Use more varied descriptive words to improve richness")
  179. diversity_score = unique_ratio * 100
  180. else:
  181. diversity_score = min(unique_ratio * 150, 100) # Cap at 100
  182. else:
  183. diversity_score = 0.0
  184. issues.append("SEO: Empty or very short description")
  185. suggestions.append("Add a detailed product description")
  186. score_components.append(diversity_score)
  187. # 3. Adjective/descriptive word presence
  188. descriptive_patterns = [
  189. r'\b(premium|quality|durable|lightweight|comfortable|stylish|modern|classic)\b',
  190. r'\b(professional|authentic|genuine|original|certified|official)\b',
  191. r'\b(innovative|advanced|smart|efficient|powerful|reliable)\b'
  192. ]
  193. descriptive_count = sum(len(re.findall(pattern, description.lower())) for pattern in descriptive_patterns)
  194. if descriptive_count == 0:
  195. issues.append("SEO: No descriptive/quality adjectives found")
  196. suggestions.append("Add descriptive words like 'premium', 'durable', 'comfortable' to enhance appeal")
  197. descriptive_score = 0.0
  198. elif descriptive_count < 3:
  199. suggestions.append("Consider adding more descriptive adjectives for better engagement")
  200. descriptive_score = (descriptive_count / 3) * 100
  201. else:
  202. descriptive_score = 100.0
  203. score_components.append(descriptive_score)
  204. # 4. Sentence structure (not just bullet points)
  205. sentences = re.split(r'[.!?]+', description)
  206. complete_sentences = [s for s in sentences if len(s.split()) >= 5]
  207. if len(complete_sentences) < 2:
  208. issues.append("SEO: Description lacks complete sentences (use prose, not just bullet points)")
  209. suggestions.append("Write 2-3 complete sentences describing the product")
  210. structure_score = (len(complete_sentences) / 2) * 100
  211. else:
  212. structure_score = 100.0
  213. score_components.append(structure_score)
  214. final_score = np.mean(score_components)
  215. return final_score, issues, suggestions
  216. def _check_backend_keywords(
  217. self,
  218. title: str,
  219. description: str,
  220. attributes: Dict,
  221. category: str
  222. ) -> Tuple[float, List[str], List[str]]:
  223. """Check for presence of searchable backend keywords"""
  224. issues = []
  225. suggestions = []
  226. combined_text = f"{title} {description}".lower()
  227. # Get category-specific keywords
  228. expected_keywords = self.category_keywords.get(category, [])
  229. present_count = 0
  230. for keyword in expected_keywords:
  231. if keyword in combined_text or keyword in str(attributes.values()).lower():
  232. present_count += 1
  233. else:
  234. issues.append(f"SEO: Missing common search term '{keyword}' for {category}")
  235. suggestions.append(f"Consider mentioning '{keyword}' if applicable to improve searchability")
  236. coverage_score = (present_count / len(expected_keywords)) * 100 if expected_keywords else 100.0
  237. # Check for high-value terms
  238. high_value_present = 0
  239. all_high_value = []
  240. for category_terms in self.high_value_terms.values():
  241. all_high_value.extend(category_terms)
  242. for term in all_high_value:
  243. if term in combined_text:
  244. high_value_present += 1
  245. if high_value_present == 0:
  246. issues.append("SEO: No high-value search terms found (e.g., 'premium', 'durable', 'best')")
  247. suggestions.append("Add 1-2 quality/value indicators to attract more searches")
  248. value_score = 0.0
  249. elif high_value_present < 2:
  250. suggestions.append("Consider adding more value-indicating terms for better positioning")
  251. value_score = (high_value_present / 2) * 100
  252. else:
  253. value_score = 100.0
  254. final_score = (coverage_score * 0.6 + value_score * 0.4)
  255. return final_score, issues, suggestions
  256. def _check_title_optimization(
  257. self,
  258. title: str,
  259. attributes: Dict
  260. ) -> Tuple[float, List[str], List[str]]:
  261. """Evaluate title quality for SEO"""
  262. issues = []
  263. suggestions = []
  264. score_components = []
  265. # 1. Title length (optimal: 50-100 characters)
  266. title_len = len(title)
  267. if title_len < 30:
  268. issues.append(f"SEO: Title too short ({title_len} chars, recommended 50-100)")
  269. suggestions.append("Expand title to include key attributes (brand, model, key features)")
  270. length_score = (title_len / 30) * 100
  271. elif title_len > 150:
  272. issues.append(f"SEO: Title too long ({title_len} chars, may be truncated in search)")
  273. suggestions.append("Shorten title to 50-100 characters, focus on key selling points")
  274. length_score = 70.0
  275. else:
  276. length_score = 100.0
  277. score_components.append(length_score)
  278. # 2. Key attributes in title
  279. key_attrs = ['brand', 'model', 'color', 'size']
  280. present_in_title = sum(1 for attr in key_attrs if attr in attributes and str(attributes[attr]).lower() in title.lower())
  281. if present_in_title < 2:
  282. issues.append("SEO: Title missing key attributes (brand, model, color, size)")
  283. suggestions.append("Include at least 2-3 key attributes in title")
  284. attr_score = (present_in_title / 2) * 100
  285. else:
  286. attr_score = 100.0
  287. score_components.append(attr_score)
  288. # 3. No keyword stuffing (repeated words)
  289. words = title.lower().split()
  290. word_counts = Counter(words)
  291. max_repetition = max(word_counts.values()) if word_counts else 0
  292. if max_repetition > 3:
  293. issues.append("SEO: Title has keyword stuffing (repeated words)")
  294. suggestions.append("Remove repeated keywords, make title natural and readable")
  295. stuffing_score = 50.0
  296. elif max_repetition > 2:
  297. suggestions.append("Reduce word repetition in title for better readability")
  298. stuffing_score = 75.0
  299. else:
  300. stuffing_score = 100.0
  301. score_components.append(stuffing_score)
  302. # 4. Capitalization (Title Case preferred)
  303. if title.isupper():
  304. issues.append("SEO: Title in ALL CAPS (reduces readability)")
  305. suggestions.append("Use Title Case for better readability")
  306. case_score = 50.0
  307. elif title.islower():
  308. issues.append("SEO: Title in lowercase (looks unprofessional)")
  309. suggestions.append("Use Title Case or Sentence case")
  310. case_score = 60.0
  311. else:
  312. case_score = 100.0
  313. score_components.append(case_score)
  314. final_score = np.mean(score_components)
  315. return final_score, issues, suggestions
  316. def _extract_keywords(self, title: str, description: str, top_n: int = 10) -> List[Dict]:
  317. """Extract top keywords using KeyBERT or fallback method"""
  318. combined_text = f"{title}. {description}"
  319. if self.keybert_model:
  320. try:
  321. keywords = self.keybert_model.extract_keywords(
  322. combined_text,
  323. keyphrase_ngram_range=(1, 2),
  324. stop_words='english',
  325. top_n=top_n
  326. )
  327. return [{'keyword': kw, 'score': round(score, 3)} for kw, score in keywords]
  328. except Exception as e:
  329. logger.warning(f"KeyBERT extraction failed: {e}, using fallback")
  330. # Fallback: simple word frequency
  331. words = re.findall(r'\b\w{4,}\b', combined_text.lower())
  332. word_freq = Counter(words).most_common(top_n)
  333. return [{'keyword': word, 'score': round(freq / len(words), 3)} for word, freq in word_freq]
  334. def _find_missing_high_value_terms(self, title: str, description: str, category: str) -> List[str]:
  335. """Identify missing high-value search terms that could improve discoverability"""
  336. combined_text = f"{title} {description}".lower()
  337. missing_terms = []
  338. for term_type, terms in self.high_value_terms.items():
  339. found = any(term in combined_text for term in terms)
  340. if not found and len(missing_terms) < 5:
  341. # Suggest one term from each category
  342. missing_terms.append(f"{term_type.replace('_', ' ')}: {terms[0]}")
  343. category_terms = self.category_keywords.get(category, [])
  344. for term in category_terms[:3]:
  345. if term not in combined_text and term not in missing_terms:
  346. missing_terms.append(f"category keyword: {term}")
  347. return missing_terms[:5] # Limit to 5 suggestions
  348. def generate_seo_report(self, product: Dict, seo_result: Dict) -> str:
  349. """Generate a human-readable SEO report"""
  350. report = []
  351. report.append(f"=== SEO Score: {seo_result['seo_score']}/100 ===\n")
  352. report.append("Score Breakdown:")
  353. for metric, score in seo_result['breakdown'].items():
  354. report.append(f" - {metric.replace('_', ' ').title()}: {score:.1f}/100")
  355. if seo_result['issues']:
  356. report.append("\nIssues Found:")
  357. for issue in seo_result['issues']:
  358. report.append(f" • {issue}")
  359. if seo_result['suggestions']:
  360. report.append("\nSuggestions:")
  361. for suggestion in seo_result['suggestions']:
  362. report.append(f" ✓ {suggestion}")
  363. if seo_result.get('extracted_keywords'):
  364. report.append("\nTop Keywords:")
  365. for kw in seo_result['extracted_keywords'][:5]:
  366. report.append(f" - {kw['keyword']} (score: {kw['score']})")
  367. if seo_result.get('missing_high_value_terms'):
  368. report.append("\nMissing High-Value Terms:")
  369. for term in seo_result['missing_high_value_terms']:
  370. report.append(f" + {term}")
  371. return "\n".join(report)