# seo_scorer.py
- import re
- import logging
- from typing import Dict, List, Tuple, Set
- from collections import Counter
- import numpy as np
- logger = logging.getLogger(__name__)
class SEODiscoverabilityScorer:
    """
    SEO & Discoverability scoring (15% weight)
    Checks: Keyword coverage, semantic richness, backend keywords,
    title optimization.

    Every ``_check_*`` method returns ``(score, issues, suggestions)`` where
    ``score`` is a plain ``float`` on a 0-100 scale. ``score_seo`` combines
    the sub-scores using ``self.weights``.
    """

    # Quality adjectives counted by the semantic-richness check; compiled once
    # at class creation instead of on every call.
    _DESCRIPTIVE_PATTERNS = (
        re.compile(r'\b(premium|quality|durable|lightweight|comfortable|stylish|modern|classic)\b'),
        re.compile(r'\b(professional|authentic|genuine|original|certified|official)\b'),
        re.compile(r'\b(innovative|advanced|smart|efficient|powerful|reliable)\b'),
    )

    # Attributes whose values are expected to appear in a product title.
    _TITLE_KEY_ATTRIBUTES = ('brand', 'model', 'color', 'size')

    def __init__(self):
        self.keybert_model = None
        self.sentence_model = None
        self._initialize_models()

        # SEO scoring weights
        self.weights = {
            'keyword_coverage': 0.35,    # Are key attributes in title/description?
            'semantic_richness': 0.30,   # Descriptive quality & vocabulary diversity
            'backend_keywords': 0.20,    # Presence of searchable backend terms
            'title_optimization': 0.15,  # Title length, structure, readability
        }

        # Category-specific important keywords
        self.category_keywords = {
            'Electronics': ['brand', 'model', 'warranty', 'condition', 'specs', 'features', 'technology'],
            'Clothing': ['brand', 'size', 'color', 'material', 'fit', 'style', 'occasion', 'care'],
            'Home & Garden': ['material', 'dimensions', 'color', 'style', 'brand', 'indoor', 'outdoor'],
            'Sports': ['brand', 'size', 'sport', 'material', 'performance', 'level', 'gender'],
        }

        # Common search terms users look for
        self.high_value_terms = {
            'quality_indicators': ['premium', 'high-quality', 'durable', 'professional', 'authentic', 'genuine'],
            'value_indicators': ['affordable', 'budget', 'value', 'economical', 'best', 'top-rated'],
            'feature_terms': ['lightweight', 'waterproof', 'wireless', 'adjustable', 'portable', 'compact'],
            'condition_terms': ['new', 'refurbished', 'used', 'like-new', 'open-box'],
        }

    def _initialize_models(self):
        """Initialize NLP models with fallback handling.

        Both models are optional: when the import or the model construction
        fails, the corresponding attribute is left as ``None`` and a warning
        is logged.
        """
        try:
            from keybert import KeyBERT
            self.keybert_model = KeyBERT()
            logger.info("KeyBERT model loaded successfully")
        except Exception as e:
            logger.warning("KeyBERT not available: %s. Using fallback keyword extraction.", e)
            self.keybert_model = None

        try:
            from sentence_transformers import SentenceTransformer
            self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
            logger.info("Sentence transformer model loaded successfully")
        except Exception as e:
            logger.warning("Sentence transformer not available: %s. Using fallback semantic analysis.", e)
            self.sentence_model = None

    @staticmethod
    def _is_blank(value) -> bool:
        """Return True when an attribute value carries no information.

        ``None``, ``False``, empty/whitespace-only strings and empty
        containers are blank. Numeric values (including ``0``) are real
        values and therefore not blank.
        """
        if value is None:
            return True
        if isinstance(value, bool):
            return not value
        if isinstance(value, (int, float)):
            return False
        if isinstance(value, str):
            return not value.strip()
        return not value

    @staticmethod
    def _contains_term(text: str, term: str) -> bool:
        """Return True when ``term`` occurs in ``text`` as a whole word/phrase.

        Both arguments are expected to be lower-cased already. Lookarounds
        are used instead of ``\\b`` so that terms starting or ending with a
        non-word character (e.g. ``5.5"``) still match correctly.
        """
        term = term.strip()
        if not term:
            return False
        return re.search(rf'(?<!\w){re.escape(term)}(?!\w)', text) is not None

    @staticmethod
    def _mean(values: List[float]) -> float:
        """Arithmetic mean as a plain ``float`` (0.0 for an empty list)."""
        return float(sum(values) / len(values)) if values else 0.0

    def score_seo(self, product: Dict, category_rules: List[Dict]) -> Dict:
        """
        Main SEO scoring function.

        Args:
            product: Mapping read via the keys ``title``, ``description``,
                ``category`` and ``attributes``. Missing or ``None`` entries
                are treated as empty.
            category_rules: Rule dicts read via ``attribute_name`` and
                ``is_mandatory``. ``None`` is treated as "no rules".

        Returns:
            Dict with ``seo_score`` (0-100, rounded to 2 decimals),
            ``breakdown`` (per-check scores), ``issues``, ``suggestions``,
            ``extracted_keywords`` and ``missing_high_value_terms``. On an
            unexpected error the same keys are returned with a score of 0.0
            and a single failure message in ``issues``.
        """
        try:
            product = product or {}
            # ``or`` guards against keys that are present but set to None.
            title = str(product.get('title') or '')
            description = str(product.get('description') or '')
            category = product.get('category') or ''
            attributes = product.get('attributes') or {}
            rules = category_rules or []

            # Order matters only for the order of issues/suggestions.
            checks = (
                # 1. Keyword Coverage (35%)
                ('keyword_coverage', lambda: self._check_keyword_coverage(
                    title, description, attributes, category, rules)),
                # 2. Semantic Richness (30%)
                ('semantic_richness', lambda: self._check_semantic_richness(
                    title, description)),
                # 3. Backend Keywords (20%)
                ('backend_keywords', lambda: self._check_backend_keywords(
                    title, description, attributes, category)),
                # 4. Title Optimization (15%)
                ('title_optimization', lambda: self._check_title_optimization(
                    title, attributes)),
            )

            scores = {}
            issues = []
            suggestions = []
            for key, run_check in checks:
                score, check_issues, check_suggestions = run_check()
                scores[key] = float(score)
                issues.extend(check_issues)
                suggestions.extend(check_suggestions)

            # Calculate final SEO score
            final_score = sum(scores[key] * self.weights[key] for key in scores)

            return {
                'seo_score': round(float(final_score), 2),
                'breakdown': scores,
                'issues': issues,
                'suggestions': suggestions,
                'extracted_keywords': self._extract_keywords(title, description),
                'missing_high_value_terms': self._find_missing_high_value_terms(title, description, category),
            }

        except Exception as e:
            logger.error("SEO scoring error: %s", e, exc_info=True)
            # Same keys as the success payload so consumers need no special case.
            return {
                'seo_score': 0.0,
                'breakdown': {},
                'issues': [f"SEO scoring failed: {str(e)}"],
                'suggestions': [],
                'extracted_keywords': [],
                'missing_high_value_terms': [],
            }

    def _check_keyword_coverage(
        self,
        title: str,
        description: str,
        attributes: Dict,
        category: str,
        rules: List[Dict]
    ) -> Tuple[float, List[str], List[str]]:
        """Check if key product attributes are mentioned in title/description.

        For every mandatory rule: full credit when the attribute's value
        appears as a whole word/phrase in the text, half credit when only the
        attribute name appears, no credit otherwise. Rules without an
        ``attribute_name`` are ignored. Returns 100.0 when there are no
        mandatory attributes.
        """
        issues = []
        suggestions = []

        combined_text = f"{title} {description}".lower()
        mandatory_attrs = [
            r['attribute_name']
            for r in rules
            if r.get('is_mandatory') and r.get('attribute_name')
        ]
        if not mandatory_attrs:
            return 100.0, [], []

        covered_count = 0.0
        for attr_name in mandatory_attrs:
            attr_value = attributes.get(attr_name)

            if self._is_blank(attr_value):
                issues.append(f"SEO: Mandatory attribute '{attr_name}' is missing entirely")
                suggestions.append(f"Add {attr_name} to improve discoverability")
                continue

            # Whole-word match: a size of "M" must not match "premium".
            if self._contains_term(combined_text, str(attr_value).lower()):
                covered_count += 1
            elif str(attr_name).lower() in combined_text:
                # Attribute name mentioned but not value (substring match so
                # plural forms such as "sizes" still count).
                covered_count += 0.5
                issues.append(f"SEO: '{attr_name}' mentioned but value '{attr_value}' not clearly stated")
                suggestions.append(f"Include specific {attr_name} '{attr_value}' in title or description")
            else:
                issues.append(f"SEO: Key attribute '{attr_name}: {attr_value}' not mentioned in title/description")
                suggestions.append(f"Add '{attr_name}: {attr_value}' to title or first line of description")

        score = (covered_count / len(mandatory_attrs)) * 100
        return score, issues, suggestions

    def _check_semantic_richness(
        self,
        title: str,
        description: str
    ) -> Tuple[float, List[str], List[str]]:
        """Evaluate descriptive quality and vocabulary diversity.

        The score is the mean of four components computed from the
        description: word count, unique-word ratio, number of quality
        adjectives and number of sentences with at least five words.
        ``title`` is accepted for signature symmetry but not used.
        """
        issues = []
        suggestions = []
        score_components = []
        description_lower = description.lower()

        # 1. Description length check
        desc_length = len(description.split())
        if desc_length < 20:
            issues.append(f"SEO: Description too short ({desc_length} words, recommended 50+)")
            suggestions.append("Expand description to 50-150 words for better SEO")
            length_score = (desc_length / 20) * 100
        elif desc_length > 300:
            issues.append(f"SEO: Description very long ({desc_length} words, may hurt readability)")
            suggestions.append("Consider condensing to 50-200 words for optimal engagement")
            length_score = 80.0
        else:
            length_score = 100.0
        score_components.append(length_score)

        # 2. Vocabulary diversity (unique words ratio)
        words = re.findall(r'\b\w+\b', description_lower)
        if words:
            unique_ratio = len(set(words)) / len(words)
            if unique_ratio < 0.5:
                issues.append("SEO: Low vocabulary diversity (repetitive text)")
                suggestions.append("Use more varied descriptive words to improve richness")
                diversity_score = unique_ratio * 100
            else:
                diversity_score = min(unique_ratio * 150, 100)  # Cap at 100
        else:
            diversity_score = 0.0
            issues.append("SEO: Empty or very short description")
            suggestions.append("Add a detailed product description")
        score_components.append(diversity_score)

        # 3. Adjective/descriptive word presence
        descriptive_count = sum(
            len(pattern.findall(description_lower))
            for pattern in self._DESCRIPTIVE_PATTERNS
        )
        if descriptive_count == 0:
            issues.append("SEO: No descriptive/quality adjectives found")
            suggestions.append("Add descriptive words like 'premium', 'durable', 'comfortable' to enhance appeal")
            descriptive_score = 0.0
        elif descriptive_count < 3:
            suggestions.append("Consider adding more descriptive adjectives for better engagement")
            descriptive_score = (descriptive_count / 3) * 100
        else:
            descriptive_score = 100.0
        score_components.append(descriptive_score)

        # 4. Sentence structure (not just bullet points)
        sentences = re.split(r'[.!?]+', description)
        complete_sentences = [s for s in sentences if len(s.split()) >= 5]
        if len(complete_sentences) < 2:
            issues.append("SEO: Description lacks complete sentences (use prose, not just bullet points)")
            suggestions.append("Write 2-3 complete sentences describing the product")
            structure_score = (len(complete_sentences) / 2) * 100
        else:
            structure_score = 100.0
        score_components.append(structure_score)

        return self._mean(score_components), issues, suggestions

    def _check_backend_keywords(
        self,
        title: str,
        description: str,
        attributes: Dict,
        category: str
    ) -> Tuple[float, List[str], List[str]]:
        """Check for presence of searchable backend keywords.

        Combines two parts: coverage of ``self.category_keywords[category]``
        in the title/description or attribute values (60%), and presence of
        at least two whole-word ``self.high_value_terms`` in the
        title/description (40%). Unknown categories get full coverage credit.
        """
        issues = []
        suggestions = []

        combined_text = f"{title} {description}".lower()
        # Built once from the actual values; str(dict.values()) would leak the
        # literal text "dict_values([...])" into the searched haystack.
        attribute_text = " ".join(str(value) for value in attributes.values()).lower()

        # Get category-specific keywords
        expected_keywords = self.category_keywords.get(category, [])

        present_count = 0
        for keyword in expected_keywords:
            if keyword in combined_text or keyword in attribute_text:
                present_count += 1
            else:
                issues.append(f"SEO: Missing common search term '{keyword}' for {category}")
                suggestions.append(f"Consider mentioning '{keyword}' if applicable to improve searchability")

        coverage_score = (present_count / len(expected_keywords)) * 100 if expected_keywords else 100.0

        # Check for high-value terms (whole words: "used" must not match "unused")
        high_value_present = sum(
            1
            for terms in self.high_value_terms.values()
            for term in terms
            if self._contains_term(combined_text, term)
        )

        if high_value_present == 0:
            issues.append("SEO: No high-value search terms found (e.g., 'premium', 'durable', 'best')")
            suggestions.append("Add 1-2 quality/value indicators to attract more searches")
            value_score = 0.0
        elif high_value_present < 2:
            suggestions.append("Consider adding more value-indicating terms for better positioning")
            value_score = (high_value_present / 2) * 100
        else:
            value_score = 100.0

        final_score = coverage_score * 0.6 + value_score * 0.4
        return final_score, issues, suggestions

    def _check_title_optimization(
        self,
        title: str,
        attributes: Dict
    ) -> Tuple[float, List[str], List[str]]:
        """Evaluate title quality for SEO.

        The score is the mean of four components: length (penalised below 30
        or above 150 characters), key attribute values present in the title,
        word repetition, and capitalization.
        """
        issues = []
        suggestions = []
        score_components = []

        # 1. Title length (optimal: 50-100 characters)
        title_len = len(title)
        if title_len < 30:
            issues.append(f"SEO: Title too short ({title_len} chars, recommended 50-100)")
            suggestions.append("Expand title to include key attributes (brand, model, key features)")
            length_score = (title_len / 30) * 100
        elif title_len > 150:
            issues.append(f"SEO: Title too long ({title_len} chars, may be truncated in search)")
            suggestions.append("Shorten title to 50-100 characters, focus on key selling points")
            length_score = 70.0
        else:
            length_score = 100.0
        score_components.append(length_score)

        # 2. Key attributes in title
        title_lower = title.lower()
        # Blank values are skipped: '' is a substring of every title and would
        # otherwise be counted as "present".
        present_in_title = sum(
            1
            for attr in self._TITLE_KEY_ATTRIBUTES
            if not self._is_blank(attributes.get(attr))
            and self._contains_term(title_lower, str(attributes[attr]).lower())
        )

        if present_in_title < 2:
            issues.append("SEO: Title missing key attributes (brand, model, color, size)")
            suggestions.append("Include at least 2-3 key attributes in title")
            attr_score = (present_in_title / 2) * 100
        else:
            attr_score = 100.0
        score_components.append(attr_score)

        # 3. No keyword stuffing (repeated words)
        word_counts = Counter(title_lower.split())
        max_repetition = max(word_counts.values()) if word_counts else 0

        if max_repetition > 3:
            issues.append("SEO: Title has keyword stuffing (repeated words)")
            suggestions.append("Remove repeated keywords, make title natural and readable")
            stuffing_score = 50.0
        elif max_repetition > 2:
            suggestions.append("Reduce word repetition in title for better readability")
            stuffing_score = 75.0
        else:
            stuffing_score = 100.0
        score_components.append(stuffing_score)

        # 4. Capitalization (Title Case preferred)
        if title.isupper():
            issues.append("SEO: Title in ALL CAPS (reduces readability)")
            suggestions.append("Use Title Case for better readability")
            case_score = 50.0
        elif title.islower():
            issues.append("SEO: Title in lowercase (looks unprofessional)")
            suggestions.append("Use Title Case or Sentence case")
            case_score = 60.0
        else:
            case_score = 100.0
        score_components.append(case_score)

        return self._mean(score_components), issues, suggestions

    def _extract_keywords(self, title: str, description: str, top_n: int = 10) -> List[Dict]:
        """Extract top keywords using KeyBERT or fallback method.

        Returns a list of ``{'keyword': str, 'score': float}`` dicts. The
        fallback (used when no KeyBERT model is loaded or extraction fails)
        ranks words of four or more characters by relative frequency.
        """
        combined_text = f"{title}. {description}"

        if self.keybert_model:
            try:
                keywords = self.keybert_model.extract_keywords(
                    combined_text,
                    keyphrase_ngram_range=(1, 2),
                    stop_words='english',
                    top_n=top_n
                )
                # float() keeps the payload free of library-specific scalar types.
                return [{'keyword': kw, 'score': round(float(score), 3)} for kw, score in keywords]
            except Exception as e:
                logger.warning("KeyBERT extraction failed: %s, using fallback", e)

        # Fallback: simple word frequency
        words = re.findall(r'\b\w{4,}\b', combined_text.lower())
        if not words:
            return []
        total_words = len(words)
        return [
            {'keyword': word, 'score': round(freq / total_words, 3)}
            for word, freq in Counter(words).most_common(top_n)
        ]

    def _find_missing_high_value_terms(self, title: str, description: str, category: str) -> List[str]:
        """Identify missing high-value search terms that could improve discoverability.

        Suggests the first term of every high-value group that has no
        whole-word match in the text, followed by any of the first three
        category keywords that are absent. At most five suggestions are
        returned.
        """
        combined_text = f"{title} {description}".lower()
        missing_terms = []

        for term_type, terms in self.high_value_terms.items():
            if not terms or len(missing_terms) >= 5:
                continue
            if not any(self._contains_term(combined_text, term) for term in terms):
                # Suggest one term from each category
                missing_terms.append(f"{term_type.replace('_', ' ')}: {terms[0]}")

        for term in self.category_keywords.get(category, [])[:3]:
            if term not in combined_text:
                missing_terms.append(f"category keyword: {term}")

        return missing_terms[:5]  # Limit to 5 suggestions

    def generate_seo_report(self, product: Dict, seo_result: Dict) -> str:
        """Generate a human-readable SEO report.

        Args:
            product: Accepted for interface compatibility; not read.
            seo_result: Result dict as returned by ``score_seo``. Missing
                sections are simply omitted from the report.

        Returns:
            The report as a single newline-joined string.
        """
        report = [f"=== SEO Score: {seo_result.get('seo_score', 0.0)}/100 ===\n"]

        report.append("Score Breakdown:")
        for metric, score in seo_result.get('breakdown', {}).items():
            report.append(f" - {metric.replace('_', ' ').title()}: {score:.1f}/100")

        issues = seo_result.get('issues')
        if issues:
            report.append("\nIssues Found:")
            report.extend(f" • {issue}" for issue in issues)

        suggestions = seo_result.get('suggestions')
        if suggestions:
            report.append("\nSuggestions:")
            report.extend(f" ✓ {suggestion}" for suggestion in suggestions)

        extracted = seo_result.get('extracted_keywords')
        if extracted:
            report.append("\nTop Keywords:")
            report.extend(f" - {kw['keyword']} (score: {kw['score']})" for kw in extracted[:5])

        missing = seo_result.get('missing_high_value_terms')
        if missing:
            report.append("\nMissing High-Value Terms:")
            report.extend(f" + {term}" for term in missing)

        return "\n".join(report)
|