# seo_scorer.py
import logging
import re
from collections import Counter
from typing import Dict, List, Tuple, Set

import numpy as np

logger = logging.getLogger(__name__)


class SEODiscoverabilityScorer:
    """
    SEO & Discoverability scoring (15% weight)
    Checks: Keyword coverage, semantic richness, backend keywords

    Every sub-check returns a ``(score, issues, suggestions)`` tuple where
    ``score`` is on a 0-100 scale. ``score_seo`` combines the sub-scores
    using ``self.weights``.
    """

    def __init__(self):
        # Optional NLP back-ends; both stay None when the packages are missing.
        self.keybert_model = None
        self.sentence_model = None
        self._initialize_models()

        # SEO scoring weights
        self.weights = {
            'keyword_coverage': 0.35,    # Are key attributes in title/description?
            'semantic_richness': 0.30,   # Descriptive quality & vocabulary diversity
            'backend_keywords': 0.20,    # Presence of searchable backend terms
            'title_optimization': 0.15,  # Title length, structure, readability
        }

        # Category-specific important keywords
        self.category_keywords = {
            'Electronics': ['brand', 'model', 'warranty', 'condition', 'specs', 'features', 'technology'],
            'Clothing': ['brand', 'size', 'color', 'material', 'fit', 'style', 'occasion', 'care'],
            'Home & Garden': ['material', 'dimensions', 'color', 'style', 'brand', 'indoor', 'outdoor'],
            'Sports': ['brand', 'size', 'sport', 'material', 'performance', 'level', 'gender'],
        }

        # Common search terms users look for
        self.high_value_terms = {
            'quality_indicators': ['premium', 'high-quality', 'durable', 'professional', 'authentic', 'genuine'],
            'value_indicators': ['affordable', 'budget', 'value', 'economical', 'best', 'top-rated'],
            'feature_terms': ['lightweight', 'waterproof', 'wireless', 'adjustable', 'portable', 'compact'],
            'condition_terms': ['new', 'refurbished', 'used', 'like-new', 'open-box'],
        }

    def _initialize_models(self):
        """Initialize NLP models with fallback handling"""
        # The broad `except Exception` is deliberate: the optional imports and
        # model construction can fail in many ways, and every failure should
        # degrade to the built-in fallbacks instead of breaking construction.
        try:
            from keybert import KeyBERT
            self.keybert_model = KeyBERT()
            logger.info("KeyBERT model loaded successfully")
        except Exception as e:
            logger.warning("KeyBERT not available: %s. Using fallback keyword extraction.", e)
            self.keybert_model = None

        # NOTE(review): sentence_model is loaded here but not referenced by any
        # other method visible in this file - confirm it is used by callers.
        try:
            from sentence_transformers import SentenceTransformer
            self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
            logger.info("Sentence transformer model loaded successfully")
        except Exception as e:
            logger.warning("Sentence transformer not available: %s. Using fallback semantic analysis.", e)
            self.sentence_model = None

    @staticmethod
    def _is_blank(value) -> bool:
        """Return True when an attribute value carries no information.

        None, empty/whitespace-only strings and empty containers are blank.
        Falsy-but-meaningful values such as ``0`` or ``False`` are not.
        """
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        if isinstance(value, (list, tuple, set, frozenset, dict)):
            return len(value) == 0
        return False

    def score_seo(self, product: Dict, category_rules: List[Dict]) -> Dict:
        """
        Main SEO scoring function
        Returns: score breakdown, issues, and suggestions

        Args:
            product: Mapping read for the keys 'title', 'description',
                'category' and 'attributes'. Missing or None values are
                treated as empty.
            category_rules: Rule dicts; only 'attribute_name' and
                'is_mandatory' are read. None is treated as no rules.

        Returns:
            Dict with keys 'seo_score' (weighted 0-100 score rounded to two
            decimals), 'breakdown', 'issues', 'suggestions',
            'extracted_keywords' and 'missing_high_value_terms'. If any step
            raises, the same keys are returned with a 0.0 score and the
            failure reported in 'issues'.
        """
        try:
            # `or` (rather than a .get default) also covers keys that are
            # present but explicitly None.
            title = str(product.get('title') or '')
            description = str(product.get('description') or '')
            category = product.get('category') or ''
            attributes = product.get('attributes') or {}
            rules = category_rules or []

            scores = {}
            issues = []
            suggestions = []

            # 1. Keyword Coverage (35%)
            kw_score, kw_issues, kw_suggestions = self._check_keyword_coverage(
                title, description, attributes, category, rules
            )
            scores['keyword_coverage'] = kw_score
            issues.extend(kw_issues)
            suggestions.extend(kw_suggestions)

            # 2. Semantic Richness (30%)
            semantic_score, semantic_issues, semantic_suggestions = self._check_semantic_richness(
                title, description
            )
            scores['semantic_richness'] = semantic_score
            issues.extend(semantic_issues)
            suggestions.extend(semantic_suggestions)

            # 3. Backend Keywords (20%)
            backend_score, backend_issues, backend_suggestions = self._check_backend_keywords(
                title, description, attributes, category
            )
            scores['backend_keywords'] = backend_score
            issues.extend(backend_issues)
            suggestions.extend(backend_suggestions)

            # 4. Title Optimization (15%)
            title_score, title_issues, title_suggestions = self._check_title_optimization(
                title, attributes
            )
            scores['title_optimization'] = title_score
            issues.extend(title_issues)
            suggestions.extend(title_suggestions)

            # Calculate final SEO score
            final_score = sum(scores[key] * self.weights[key] for key in scores)

            return {
                'seo_score': round(float(final_score), 2),
                'breakdown': scores,
                'issues': issues,
                'suggestions': suggestions,
                'extracted_keywords': self._extract_keywords(title, description),
                'missing_high_value_terms': self._find_missing_high_value_terms(title, description, category),
            }
        except Exception as e:
            # Top-level boundary: scoring is best-effort, so report the
            # failure in the result instead of propagating it.
            logger.error("SEO scoring error: %s", e, exc_info=True)
            return {
                'seo_score': 0.0,
                'breakdown': {},
                'issues': [f"SEO scoring failed: {str(e)}"],
                'suggestions': [],
                # Same keys as the success path so callers can index safely.
                'extracted_keywords': [],
                'missing_high_value_terms': [],
            }

    def _check_keyword_coverage(
        self,
        title: str,
        description: str,
        attributes: Dict,
        category: str,
        rules: List[Dict]
    ) -> Tuple[float, List[str], List[str]]:
        """Check if key product attributes are mentioned in title/description

        A mandatory attribute earns 1 point when its value appears in the
        combined text, 0.5 when only its name appears, and 0 otherwise.
        The score is the earned share of the mandatory attributes (0-100);
        with no mandatory rules the score is 100.
        """
        issues = []
        suggestions = []

        mandatory_attrs = [r['attribute_name'] for r in rules if r.get('is_mandatory')]
        total_mandatory = len(mandatory_attrs)
        if total_mandatory == 0:
            return 100.0, [], []

        combined_text = f"{title} {description}".lower()
        covered_count = 0.0

        for attr_name in mandatory_attrs:
            attr_value = attributes.get(attr_name, '')

            # Only truly empty values count as missing; 0 / False are data.
            if self._is_blank(attr_value):
                issues.append(f"SEO: Mandatory attribute '{attr_name}' is missing entirely")
                suggestions.append(f"Add {attr_name} to improve discoverability")
                continue

            attr_value_str = str(attr_value).strip().lower()

            # Check if attribute value appears in title or description
            if attr_value_str in combined_text:
                covered_count += 1
            elif attr_name.lower() in combined_text:
                # Attribute name mentioned but not value
                covered_count += 0.5
                issues.append(f"SEO: '{attr_name}' mentioned but value '{attr_value}' not clearly stated")
                suggestions.append(f"Include specific {attr_name} '{attr_value}' in title or description")
            else:
                issues.append(f"SEO: Key attribute '{attr_name}: {attr_value}' not mentioned in title/description")
                suggestions.append(f"Add '{attr_name}: {attr_value}' to title or first line of description")

        score = (covered_count / total_mandatory) * 100
        return score, issues, suggestions

    def _check_semantic_richness(
        self,
        title: str,
        description: str
    ) -> Tuple[float, List[str], List[str]]:
        """Evaluate descriptive quality and vocabulary diversity

        Averages four 0-100 components computed from the description:
        word count, unique-word ratio, descriptive-adjective count and
        number of sentences with at least five words. ``title`` is accepted
        for signature parity but is not used.
        """
        issues = []
        suggestions = []
        score_components = []
        description_lower = description.lower()

        # 1. Description length check
        desc_length = len(description.split())
        if desc_length < 20:
            issues.append(f"SEO: Description too short ({desc_length} words, recommended 50+)")
            suggestions.append("Expand description to 50-150 words for better SEO")
            length_score = (desc_length / 20) * 100
        elif desc_length > 300:
            issues.append(f"SEO: Description very long ({desc_length} words, may hurt readability)")
            suggestions.append("Consider condensing to 50-200 words for optimal engagement")
            length_score = 80.0
        else:
            length_score = 100.0
        score_components.append(length_score)

        # 2. Vocabulary diversity (unique words ratio)
        words = re.findall(r'\b\w+\b', description_lower)
        if words:
            unique_ratio = len(set(words)) / len(words)
            if unique_ratio < 0.5:
                issues.append("SEO: Low vocabulary diversity (repetitive text)")
                suggestions.append("Use more varied descriptive words to improve richness")
                diversity_score = unique_ratio * 100
            else:
                diversity_score = min(unique_ratio * 150, 100)  # Cap at 100
        else:
            diversity_score = 0.0
            issues.append("SEO: Empty or very short description")
            suggestions.append("Add a detailed product description")
        score_components.append(diversity_score)

        # 3. Adjective/descriptive word presence
        descriptive_patterns = [
            r'\b(premium|quality|durable|lightweight|comfortable|stylish|modern|classic)\b',
            r'\b(professional|authentic|genuine|original|certified|official)\b',
            r'\b(innovative|advanced|smart|efficient|powerful|reliable)\b'
        ]
        descriptive_count = sum(
            len(re.findall(pattern, description_lower)) for pattern in descriptive_patterns
        )
        if descriptive_count == 0:
            issues.append("SEO: No descriptive/quality adjectives found")
            suggestions.append("Add descriptive words like 'premium', 'durable', 'comfortable' to enhance appeal")
            descriptive_score = 0.0
        elif descriptive_count < 3:
            suggestions.append("Consider adding more descriptive adjectives for better engagement")
            descriptive_score = (descriptive_count / 3) * 100
        else:
            descriptive_score = 100.0
        score_components.append(descriptive_score)

        # 4. Sentence structure (not just bullet points)
        sentences = re.split(r'[.!?]+', description)
        complete_sentences = [s for s in sentences if len(s.split()) >= 5]
        if len(complete_sentences) < 2:
            issues.append("SEO: Description lacks complete sentences (use prose, not just bullet points)")
            suggestions.append("Write 2-3 complete sentences describing the product")
            structure_score = (len(complete_sentences) / 2) * 100
        else:
            structure_score = 100.0
        score_components.append(structure_score)

        # float() so callers get a plain Python float, as annotated.
        final_score = float(np.mean(score_components))
        return final_score, issues, suggestions

    def _check_backend_keywords(
        self,
        title: str,
        description: str,
        attributes: Dict,
        category: str
    ) -> Tuple[float, List[str], List[str]]:
        """Check for presence of searchable backend keywords

        The score is 60% category-keyword coverage (keywords found in the
        title/description or in any attribute value) plus 40% for the
        presence of high-value terms (two or more distinct terms give full
        marks). Unknown categories get full coverage marks.
        """
        issues = []
        suggestions = []
        combined_text = f"{title} {description}".lower()

        # Built once, from the values themselves: str(dict.values()) would
        # also search the literal "dict_values([...])" wrapper text.
        attribute_text = " ".join(str(value) for value in attributes.values()).lower()

        # Get category-specific keywords
        expected_keywords = self.category_keywords.get(category, [])
        present_count = 0
        for keyword in expected_keywords:
            if keyword in combined_text or keyword in attribute_text:
                present_count += 1
            else:
                issues.append(f"SEO: Missing common search term '{keyword}' for {category}")
                suggestions.append(f"Consider mentioning '{keyword}' if applicable to improve searchability")

        coverage_score = (present_count / len(expected_keywords)) * 100 if expected_keywords else 100.0

        # Check for high-value terms
        high_value_present = sum(
            1
            for category_terms in self.high_value_terms.values()
            for term in category_terms
            if term in combined_text
        )

        if high_value_present == 0:
            issues.append("SEO: No high-value search terms found (e.g., 'premium', 'durable', 'best')")
            suggestions.append("Add 1-2 quality/value indicators to attract more searches")
            value_score = 0.0
        elif high_value_present < 2:
            suggestions.append("Consider adding more value-indicating terms for better positioning")
            value_score = (high_value_present / 2) * 100
        else:
            value_score = 100.0

        final_score = (coverage_score * 0.6 + value_score * 0.4)
        return final_score, issues, suggestions

    def _check_title_optimization(
        self,
        title: str,
        attributes: Dict
    ) -> Tuple[float, List[str], List[str]]:
        """Evaluate title quality for SEO

        Averages four 0-100 components: title length, how many of
        brand/model/color/size values appear in the title, word repetition
        and capitalization.
        """
        issues = []
        suggestions = []
        score_components = []
        title_lower = title.lower()

        # 1. Title length (optimal: 50-100 characters)
        title_len = len(title)
        if title_len < 30:
            issues.append(f"SEO: Title too short ({title_len} chars, recommended 50-100)")
            suggestions.append("Expand title to include key attributes (brand, model, key features)")
            length_score = (title_len / 30) * 100
        elif title_len > 150:
            issues.append(f"SEO: Title too long ({title_len} chars, may be truncated in search)")
            suggestions.append("Shorten title to 50-100 characters, focus on key selling points")
            length_score = 70.0
        else:
            length_score = 100.0
        score_components.append(length_score)

        # 2. Key attributes in title
        key_attrs = ['brand', 'model', 'color', 'size']
        # Blank values are skipped: an empty string is a substring of every
        # title and would otherwise be counted as "present".
        present_in_title = sum(
            1
            for attr in key_attrs
            if not self._is_blank(attributes.get(attr))
            and str(attributes[attr]).strip().lower() in title_lower
        )
        if present_in_title < 2:
            issues.append("SEO: Title missing key attributes (brand, model, color, size)")
            suggestions.append("Include at least 2-3 key attributes in title")
            attr_score = (present_in_title / 2) * 100
        else:
            attr_score = 100.0
        score_components.append(attr_score)

        # 3. No keyword stuffing (repeated words)
        word_counts = Counter(title_lower.split())
        max_repetition = max(word_counts.values()) if word_counts else 0
        if max_repetition > 3:
            issues.append("SEO: Title has keyword stuffing (repeated words)")
            suggestions.append("Remove repeated keywords, make title natural and readable")
            stuffing_score = 50.0
        elif max_repetition > 2:
            suggestions.append("Reduce word repetition in title for better readability")
            stuffing_score = 75.0
        else:
            stuffing_score = 100.0
        score_components.append(stuffing_score)

        # 4. Capitalization (Title Case preferred)
        if title.isupper():
            issues.append("SEO: Title in ALL CAPS (reduces readability)")
            suggestions.append("Use Title Case for better readability")
            case_score = 50.0
        elif title.islower():
            issues.append("SEO: Title in lowercase (looks unprofessional)")
            suggestions.append("Use Title Case or Sentence case")
            case_score = 60.0
        else:
            case_score = 100.0
        score_components.append(case_score)

        # float() so callers get a plain Python float, as annotated.
        final_score = float(np.mean(score_components))
        return final_score, issues, suggestions

    def _extract_keywords(self, title: str, description: str, top_n: int = 10) -> List[Dict]:
        """Extract top keywords using KeyBERT or fallback method

        Returns a list of ``{'keyword': str, 'score': float}`` dicts. The
        fallback scores each word of four or more characters by its share
        of all such words in the combined text.
        """
        combined_text = f"{title}. {description}"

        if self.keybert_model:
            try:
                keywords = self.keybert_model.extract_keywords(
                    combined_text,
                    keyphrase_ngram_range=(1, 2),
                    stop_words='english',
                    top_n=top_n
                )
                return [{'keyword': kw, 'score': round(float(score), 3)} for kw, score in keywords]
            except Exception as e:
                logger.warning("KeyBERT extraction failed: %s, using fallback", e)

        # Fallback: simple word frequency
        words = re.findall(r'\b\w{4,}\b', combined_text.lower())
        total_words = len(words)
        # When `words` is empty most_common() yields nothing, so the
        # division below is never evaluated with a zero denominator.
        return [
            {'keyword': word, 'score': round(freq / total_words, 3)}
            for word, freq in Counter(words).most_common(top_n)
        ]

    def _find_missing_high_value_terms(self, title: str, description: str, category: str) -> List[str]:
        """Identify missing high-value search terms that could improve discoverability

        Suggests the first term of every high-value group that has no term
        in the text, then up to three absent category keywords. At most
        five suggestions are returned.
        """
        combined_text = f"{title} {description}".lower()
        missing_terms = []

        for term_type, terms in self.high_value_terms.items():
            # `terms and` guards the terms[0] lookup for an empty group.
            if terms and not any(term in combined_text for term in terms) and len(missing_terms) < 5:
                # Suggest one term from each category
                missing_terms.append(f"{term_type.replace('_', ' ')}: {terms[0]}")

        for term in self.category_keywords.get(category, [])[:3]:
            # Compare the formatted entry so the duplicate check can match.
            entry = f"category keyword: {term}"
            if term not in combined_text and entry not in missing_terms:
                missing_terms.append(entry)

        return missing_terms[:5]  # Limit to 5 suggestions

    def generate_seo_report(self, product: Dict, seo_result: Dict) -> str:
        """Generate a human-readable SEO report

        Args:
            product: Accepted for interface compatibility; not read.
            seo_result: Result dict in the shape produced by ``score_seo``.
                'seo_score' is required; every other section is optional
                and omitted from the report when empty or absent.

        Returns:
            The report as a single newline-joined string.
        """
        report = []
        report.append(f"=== SEO Score: {seo_result['seo_score']}/100 ===\n")

        report.append("Score Breakdown:")
        for metric, score in seo_result.get('breakdown', {}).items():
            report.append(f" - {metric.replace('_', ' ').title()}: {score:.1f}/100")

        if seo_result.get('issues'):
            report.append("\nIssues Found:")
            report.extend(f" • {issue}" for issue in seo_result['issues'])

        if seo_result.get('suggestions'):
            report.append("\nSuggestions:")
            report.extend(f" ✓ {suggestion}" for suggestion in seo_result['suggestions'])

        if seo_result.get('extracted_keywords'):
            report.append("\nTop Keywords:")
            for kw in seo_result['extracted_keywords'][:5]:
                report.append(f" - {kw['keyword']} (score: {kw['score']})")

        if seo_result.get('missing_high_value_terms'):
            report.append("\nMissing High-Value Terms:")
            report.extend(f" + {term}" for term in seo_result['missing_high_value_terms'])

        return "\n".join(report)