  1. # seo_scorer.py
  2. import re
  3. import logging
  4. from typing import Dict, List, Tuple, Set
  5. from collections import Counter
  6. import numpy as np
  7. logger = logging.getLogger(__name__)
  8. class SEODiscoverabilityScorer:
  9. """
  10. SEO & Discoverability scoring (15% weight)
  11. Checks: Keyword coverage, semantic richness, backend keywords
  12. """
  13. def __init__(self):
  14. self.keybert_model = None
  15. self.sentence_model = None
  16. self._initialize_models()
  17. # SEO scoring weights
  18. self.weights = {
  19. 'keyword_coverage': 0.35, # Are key attributes in title/description?
  20. 'semantic_richness': 0.30, # Descriptive quality & vocabulary diversity
  21. 'backend_keywords': 0.20, # Presence of searchable backend terms
  22. 'title_optimization': 0.15 # Title length, structure, readability
  23. }
  24. # Category-specific important keywords
  25. self.category_keywords = {
  26. 'Electronics': ['brand', 'model', 'warranty', 'condition', 'specs', 'features', 'technology'],
  27. 'Clothing': ['brand', 'size', 'color', 'material', 'fit', 'style', 'occasion', 'care'],
  28. 'Home & Garden': ['material', 'dimensions', 'color', 'style', 'brand', 'indoor', 'outdoor'],
  29. 'Sports': ['brand', 'size', 'sport', 'material', 'performance', 'level', 'gender']
  30. }
  31. # Common search terms users look for
  32. self.high_value_terms = {
  33. 'quality_indicators': ['premium', 'high-quality', 'durable', 'professional', 'authentic', 'genuine'],
  34. 'value_indicators': ['affordable', 'budget', 'value', 'economical', 'best', 'top-rated'],
  35. 'feature_terms': ['lightweight', 'waterproof', 'wireless', 'adjustable', 'portable', 'compact'],
  36. 'condition_terms': ['new', 'refurbished', 'used', 'like-new', 'open-box']
  37. }
  38. def _initialize_models(self):
  39. """Initialize NLP models with fallback handling"""
  40. try:
  41. from keybert import KeyBERT
  42. self.keybert_model = KeyBERT()
  43. logger.info("KeyBERT model loaded successfully")
  44. except Exception as e:
  45. logger.warning(f"KeyBERT not available: {e}. Using fallback keyword extraction.")
  46. self.keybert_model = None
  47. try:
  48. from sentence_transformers import SentenceTransformer
  49. self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
  50. logger.info("Sentence transformer model loaded successfully")
  51. except Exception as e:
  52. logger.warning(f"Sentence transformer not available: {e}. Using fallback semantic analysis.")
  53. self.sentence_model = None
  54. def score_seo(self, product: Dict, category_rules: List[Dict]) -> Dict:
  55. """
  56. Main SEO scoring function
  57. Returns: score breakdown, issues, and suggestions
  58. """
  59. try:
  60. title = product.get('title', '')
  61. description = product.get('description', '')
  62. category = product.get('category', '')
  63. attributes = product.get('attributes', {})
  64. scores = {}
  65. issues = []
  66. suggestions = []
  67. # 1. Keyword Coverage (35%)
  68. kw_score, kw_issues, kw_suggestions = self._check_keyword_coverage(
  69. title, description, attributes, category, category_rules
  70. )
  71. scores['keyword_coverage'] = kw_score
  72. issues.extend(kw_issues)
  73. suggestions.extend(kw_suggestions)
  74. # 2. Semantic Richness (30%)
  75. semantic_score, semantic_issues, semantic_suggestions = self._check_semantic_richness(
  76. title, description
  77. )
  78. scores['semantic_richness'] = semantic_score
  79. issues.extend(semantic_issues)
  80. suggestions.extend(semantic_suggestions)
  81. # 3. Backend Keywords (20%)
  82. backend_score, backend_issues, backend_suggestions = self._check_backend_keywords(
  83. title, description, attributes, category
  84. )
  85. scores['backend_keywords'] = backend_score
  86. issues.extend(backend_issues)
  87. suggestions.extend(backend_suggestions)
  88. # 4. Title Optimization (15%)
  89. title_score, title_issues, title_suggestions = self._check_title_optimization(
  90. title, attributes
  91. )
  92. scores['title_optimization'] = title_score
  93. issues.extend(title_issues)
  94. suggestions.extend(title_suggestions)
  95. # Calculate final SEO score
  96. final_score = sum(scores[key] * self.weights[key] for key in scores)
  97. return {
  98. 'seo_score': round(final_score, 2),
  99. 'breakdown': scores,
  100. 'issues': issues,
  101. 'suggestions': suggestions,
  102. 'extracted_keywords': self._extract_keywords(title, description),
  103. 'missing_high_value_terms': self._find_missing_high_value_terms(title, description, category)
  104. }
  105. except Exception as e:
  106. logger.error(f"SEO scoring error: {e}", exc_info=True)
  107. return {
  108. 'seo_score': 0.0,
  109. 'breakdown': {},
  110. 'issues': [f"SEO scoring failed: {str(e)}"],
  111. 'suggestions': []
  112. }
  113. def _check_keyword_coverage(
  114. self,
  115. title: str,
  116. description: str,
  117. attributes: Dict,
  118. category: str,
  119. rules: List[Dict]
  120. ) -> Tuple[float, List[str], List[str]]:
  121. """Check if key product attributes are mentioned in title/description"""
  122. issues = []
  123. suggestions = []
  124. combined_text = f"{title} {description}".lower()
  125. mandatory_attrs = [r['attribute_name'] for r in rules if r.get('is_mandatory')]
  126. covered_count = 0
  127. total_mandatory = len(mandatory_attrs)
  128. if total_mandatory == 0:
  129. return 100.0, [], []
  130. for attr_name in mandatory_attrs:
  131. attr_value = attributes.get(attr_name, '')
  132. if not attr_value:
  133. issues.append(f"SEO: Mandatory attribute '{attr_name}' is missing entirely")
  134. suggestions.append(f"Add {attr_name} to improve discoverability")
  135. continue
  136. attr_value_str = str(attr_value).lower()
  137. # Check if attribute value appears in title or description
  138. if attr_value_str in combined_text:
  139. covered_count += 1
  140. elif attr_name.lower() in combined_text:
  141. # Attribute name mentioned but not value
  142. covered_count += 0.5
  143. issues.append(f"SEO: '{attr_name}' mentioned but value '{attr_value}' not clearly stated")
  144. suggestions.append(f"Include specific {attr_name} '{attr_value}' in title or description")
  145. else:
  146. issues.append(f"SEO: Key attribute '{attr_name}: {attr_value}' not mentioned in title/description")
  147. suggestions.append(f"Add '{attr_name}: {attr_value}' to title or first line of description")
  148. score = (covered_count / total_mandatory) * 100 if total_mandatory > 0 else 100.0
  149. return score, issues, suggestions
  150. def _check_semantic_richness(
  151. self,
  152. title: str,
  153. description: str
  154. ) -> Tuple[float, List[str], List[str]]:
  155. """Evaluate descriptive quality and vocabulary diversity"""
  156. issues = []
  157. suggestions = []
  158. score_components = []
  159. # 1. Description length check
  160. desc_length = len(description.split())
  161. if desc_length < 20:
  162. issues.append(f"SEO: Description too short ({desc_length} words, recommended 50+)")
  163. suggestions.append("Expand description to 50-150 words for better SEO")
  164. length_score = (desc_length / 20) * 100
  165. elif desc_length > 300:
  166. issues.append(f"SEO: Description very long ({desc_length} words, may hurt readability)")
  167. suggestions.append("Consider condensing to 50-200 words for optimal engagement")
  168. length_score = 80.0
  169. else:
  170. length_score = 100.0
  171. score_components.append(length_score)
  172. # 2. Vocabulary diversity (unique words ratio)
  173. words = re.findall(r'\b\w+\b', description.lower())
  174. if words:
  175. unique_ratio = len(set(words)) / len(words)
  176. if unique_ratio < 0.5:
  177. issues.append("SEO: Low vocabulary diversity (repetitive text)")
  178. suggestions.append("Use more varied descriptive words to improve richness")
  179. diversity_score = unique_ratio * 100
  180. else:
  181. diversity_score = min(unique_ratio * 150, 100) # Cap at 100
  182. else:
  183. diversity_score = 0.0
  184. issues.append("SEO: Empty or very short description")
  185. suggestions.append("Add a detailed product description")
  186. score_components.append(diversity_score)
  187. # 3. Adjective/descriptive word presence
  188. descriptive_patterns = [
  189. r'\b(premium|quality|durable|lightweight|comfortable|stylish|modern|classic)\b',
  190. r'\b(professional|authentic|genuine|original|certified|official)\b',
  191. r'\b(innovative|advanced|smart|efficient|powerful|reliable)\b'
  192. ]
  193. descriptive_count = sum(len(re.findall(pattern, description.lower())) for pattern in descriptive_patterns)
  194. if descriptive_count == 0:
  195. issues.append("SEO: No descriptive/quality adjectives found")
  196. suggestions.append("Add descriptive words like 'premium', 'durable', 'comfortable' to enhance appeal")
  197. descriptive_score = 0.0
  198. elif descriptive_count < 3:
  199. suggestions.append("Consider adding more descriptive adjectives for better engagement")
  200. descriptive_score = (descriptive_count / 3) * 100
  201. else:
  202. descriptive_score = 100.0
  203. score_components.append(descriptive_score)
  204. # 4. Sentence structure (not just bullet points)
  205. sentences = re.split(r'[.!?]+', description)
  206. complete_sentences = [s for s in sentences if len(s.split()) >= 5]
  207. if len(complete_sentences) < 2:
  208. issues.append("SEO: Description lacks complete sentences (use prose, not just bullet points)")
  209. suggestions.append("Write 2-3 complete sentences describing the product")
  210. structure_score = (len(complete_sentences) / 2) * 100
  211. else:
  212. structure_score = 100.0
  213. score_components.append(structure_score)
  214. final_score = np.mean(score_components)
  215. return final_score, issues, suggestions
  216. def _check_backend_keywords(
  217. self,
  218. title: str,
  219. description: str,
  220. attributes: Dict,
  221. category: str
  222. ) -> Tuple[float, List[str], List[str]]:
  223. """Check for presence of searchable backend keywords"""
  224. issues = []
  225. suggestions = []
  226. combined_text = f"{title} {description}".lower()
  227. # Get category-specific keywords
  228. expected_keywords = self.category_keywords.get(category, [])
  229. present_count = 0
  230. for keyword in expected_keywords:
  231. if keyword in combined_text or keyword in str(attributes.values()).lower():
  232. present_count += 1
  233. else:
  234. issues.append(f"SEO: Missing common search term '{keyword}' for {category}")
  235. suggestions.append(f"Consider mentioning '{keyword}' if applicable to improve searchability")
  236. coverage_score = (present_count / len(expected_keywords)) * 100 if expected_keywords else 100.0
  237. # Check for high-value terms
  238. high_value_present = 0
  239. all_high_value = []
  240. for category_terms in self.high_value_terms.values():
  241. all_high_value.extend(category_terms)
  242. for term in all_high_value:
  243. if term in combined_text:
  244. high_value_present += 1
  245. if high_value_present == 0:
  246. issues.append("SEO: No high-value search terms found (e.g., 'premium', 'durable', 'best')")
  247. suggestions.append("Add 1-2 quality/value indicators to attract more searches")
  248. value_score = 0.0
  249. elif high_value_present < 2:
  250. suggestions.append("Consider adding more value-indicating terms for better positioning")
  251. value_score = (high_value_present / 2) * 100
  252. else:
  253. value_score = 100.0
  254. final_score = (coverage_score * 0.6 + value_score * 0.4)
  255. return final_score, issues, suggestions
  256. def _check_title_optimization(
  257. self,
  258. title: str,
  259. attributes: Dict
  260. ) -> Tuple[float, List[str], List[str]]:
  261. """Evaluate title quality for SEO"""
  262. issues = []
  263. suggestions = []
  264. score_components = []
  265. # 1. Title length (optimal: 50-100 characters)
  266. title_len = len(title)
  267. if title_len < 30:
  268. issues.append(f"SEO: Title too short ({title_len} chars, recommended 50-100)")
  269. suggestions.append("Expand title to include key attributes (brand, model, key features)")
  270. length_score = (title_len / 30) * 100
  271. elif title_len > 150:
  272. issues.append(f"SEO: Title too long ({title_len} chars, may be truncated in search)")
  273. suggestions.append("Shorten title to 50-100 characters, focus on key selling points")
  274. length_score = 70.0
  275. else:
  276. length_score = 100.0
  277. score_components.append(length_score)
  278. # 2. Key attributes in title
  279. key_attrs = ['brand', 'model', 'color', 'size']
  280. present_in_title = sum(1 for attr in key_attrs if attr in attributes and str(attributes[attr]).lower() in title.lower())
  281. if present_in_title < 2:
  282. issues.append("SEO: Title missing key attributes (brand, model, color, size)")
  283. suggestions.append("Include at least 2-3 key attributes in title")
  284. attr_score = (present_in_title / 2) * 100
  285. else:
  286. attr_score = 100.0
  287. score_components.append(attr_score)
  288. # 3. No keyword stuffing (repeated words)
  289. words = title.lower().split()
  290. word_counts = Counter(words)
  291. max_repetition = max(word_counts.values()) if word_counts else 0
  292. if max_repetition > 3:
  293. issues.append("SEO: Title has keyword stuffing (repeated words)")
  294. suggestions.append("Remove repeated keywords, make title natural and readable")
  295. stuffing_score = 50.0
  296. elif max_repetition > 2:
  297. suggestions.append("Reduce word repetition in title for better readability")
  298. stuffing_score = 75.0
  299. else:
  300. stuffing_score = 100.0
  301. score_components.append(stuffing_score)
  302. # 4. Capitalization (Title Case preferred)
  303. if title.isupper():
  304. issues.append("SEO: Title in ALL CAPS (reduces readability)")
  305. suggestions.append("Use Title Case for better readability")
  306. case_score = 50.0
  307. elif title.islower():
  308. issues.append("SEO: Title in lowercase (looks unprofessional)")
  309. suggestions.append("Use Title Case or Sentence case")
  310. case_score = 60.0
  311. else:
  312. case_score = 100.0
  313. score_components.append(case_score)
  314. final_score = np.mean(score_components)
  315. return final_score, issues, suggestions
  316. def _extract_keywords(self, title: str, description: str, top_n: int = 10) -> List[Dict]:
  317. """Extract top keywords using KeyBERT or fallback method"""
  318. combined_text = f"{title}. {description}"
  319. if self.keybert_model:
  320. try:
  321. keywords = self.keybert_model.extract_keywords(
  322. combined_text,
  323. keyphrase_ngram_range=(1, 2),
  324. stop_words='english',
  325. top_n=top_n
  326. )
  327. return [{'keyword': kw, 'score': round(score, 3)} for kw, score in keywords]
  328. except Exception as e:
  329. logger.warning(f"KeyBERT extraction failed: {e}, using fallback")
  330. # Fallback: simple word frequency
  331. words = re.findall(r'\b\w{4,}\b', combined_text.lower())
  332. word_freq = Counter(words).most_common(top_n)
  333. return [{'keyword': word, 'score': round(freq / len(words), 3)} for word, freq in word_freq]
  334. def _find_missing_high_value_terms(self, title: str, description: str, category: str) -> List[str]:
  335. """Identify missing high-value search terms that could improve discoverability"""
  336. combined_text = f"{title} {description}".lower()
  337. missing_terms = []
  338. for term_type, terms in self.high_value_terms.items():
  339. found = any(term in combined_text for term in terms)
  340. if not found and len(missing_terms) < 5:
  341. # Suggest one term from each category
  342. missing_terms.append(f"{term_type.replace('_', ' ')}: {terms[0]}")
  343. category_terms = self.category_keywords.get(category, [])
  344. for term in category_terms[:3]:
  345. if term not in combined_text and term not in missing_terms:
  346. missing_terms.append(f"category keyword: {term}")
  347. return missing_terms[:5] # Limit to 5 suggestions
  348. def generate_seo_report(self, product: Dict, seo_result: Dict) -> str:
  349. """Generate a human-readable SEO report"""
  350. report = []
  351. report.append(f"=== SEO Score: {seo_result['seo_score']}/100 ===\n")
  352. report.append("Score Breakdown:")
  353. for metric, score in seo_result['breakdown'].items():
  354. report.append(f" - {metric.replace('_', ' ').title()}: {score:.1f}/100")
  355. if seo_result['issues']:
  356. report.append("\nIssues Found:")
  357. for issue in seo_result['issues']:
  358. report.append(f" • {issue}")
  359. if seo_result['suggestions']:
  360. report.append("\nSuggestions:")
  361. for suggestion in seo_result['suggestions']:
  362. report.append(f" ✓ {suggestion}")
  363. if seo_result.get('extracted_keywords'):
  364. report.append("\nTop Keywords:")
  365. for kw in seo_result['extracted_keywords'][:5]:
  366. report.append(f" - {kw['keyword']} (score: {kw['score']})")
  367. if seo_result.get('missing_high_value_terms'):
  368. report.append("\nMissing High-Value Terms:")
  369. for term in seo_result['missing_high_value_terms']:
  370. report.append(f" + {term}")
  371. return "\n".join(report)