@@ -0,0 +1,744 @@
+# title_description_scorer.py
+import re
+import logging
+from typing import Dict, List, Tuple
+from collections import Counter
+import numpy as np
+from textblob import TextBlob
+import language_tool_python
+
+logger = logging.getLogger(__name__)
+
+class TitleDescriptionScorer:
+    """
+    Combined scorer for Titles (10%) and Descriptions (20%)
+    Total weight in system: 30%
+    """
+
+    def __init__(self, use_ai: bool = True):
+        self.use_ai = use_ai
+        self.nlp = None
+        self.sentence_model = None
+        self.grammar_tool = None
+        self.ai_service = None  # default; replaced below only if the Gemini service loads
+
+        # Initialize models
+        self._initialize_models()
+
+        # Initialize AI service if available
+        if use_ai:
+            try:
+                from .gemini_service import GeminiAttributeService
+                self.ai_service = GeminiAttributeService()
+            except Exception as e:
+                logger.warning(f"Gemini service not available: {e}")
+                self.use_ai = False
+                self.ai_service = None
+
+        # Title scoring weights (10% total)
+        self.title_weights = {
+            'length_optimization': 0.25,  # 2.5%
+            'brand_presence': 0.25,       # 2.5%
+            'keyword_inclusion': 0.25,    # 2.5%
+            'readability': 0.25           # 2.5%
+        }
+
+        # Description scoring weights (20% total)
+        self.description_weights = {
+            'grammar_spelling': 0.25,  # 5%
+            'duplication': 0.20,       # 4%
+            'readability': 0.20,       # 4%
+            'completeness': 0.20,      # 4%
+            'structure': 0.15          # 3%
+        }
+
+        # Common brands for detection
+        self.common_brands = {
+            'Electronics': ['Apple', 'Samsung', 'Sony', 'LG', 'Dell', 'HP', 'Lenovo', 'Microsoft', 'Google', 'Amazon'],
+            'Clothing': ['Nike', 'Adidas', 'Puma', 'Reebok', 'Under Armour', 'Levi\'s', 'Gap', 'H&M', 'Zara'],
+            'Home & Garden': ['IKEA', 'Wayfair', 'Ashley', 'Home Depot', 'Lowe\'s'],
+            'Sports': ['Nike', 'Adidas', 'Puma', 'Reebok', 'Wilson', 'Spalding', 'Coleman']
+        }
+
+        # Spam/low-quality patterns (matched case-sensitively so the ALL-CAPS
+        # check is not triggered by ordinary lowercase words; patterns that
+        # should ignore case carry an inline (?i) flag)
+        self.spam_patterns = [
+            r'!!!+',  # Multiple exclamation marks
+            r'(?i)\b(buy now|click here|limited time|hurry|act fast)\b',  # Pushy sales phrases
+            r'[A-Z]{5,}',  # ALL CAPS words
+            r'(.)\1{3,}',  # Repeated characters (aaaa)
+            r'\$\$+',  # Multiple dollar signs
+        ]
+
+    def _initialize_models(self):
+        """Initialize NLP models with fallback handling"""
+        # Load spaCy
+        try:
+            import spacy
+            self.nlp = spacy.load("en_core_web_sm")
+            logger.info("spaCy model loaded successfully")
+        except Exception as e:
+            logger.warning(f"spaCy not available: {e}")
+            self.nlp = None
+
+        # Load Sentence Transformers for duplication
+        try:
+            from sentence_transformers import SentenceTransformer
+            self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
+            logger.info("Sentence transformer loaded successfully")
+        except Exception as e:
+            logger.warning(f"Sentence transformer not available: {e}")
+            self.sentence_model = None
+
+        # Load grammar checker
+        try:
+            self.grammar_tool = language_tool_python.LanguageTool('en-US')
+            logger.info("LanguageTool loaded successfully")
+        except Exception as e:
+            logger.warning(f"LanguageTool not available: {e}")
+            self.grammar_tool = None
+
+    def score_title_and_description(
+        self,
+        product: Dict,
+        category_rules: List[Dict]
+    ) -> Dict:
+        """
+        Main scoring function for titles and descriptions
+        Returns combined scores, issues, and suggestions
+        """
+        try:
+            title = product.get('title', '')
+            description = product.get('description', '')
+            category = product.get('category', '')
+            attributes = product.get('attributes', {})
+
+            # Score title (10%)
+            title_result = self._score_title(title, category, attributes)
+
+            # Score description (20%)
+            description_result = self._score_description(description, title, attributes, category)
+
+            # Combine results
+            combined_score = (
+                title_result['title_score'] * 0.33 +  # 10% of 30% = 33.33% of this component
+                description_result['description_score'] * 0.67  # 20% of 30% = 66.67% of this component
+            )
+
+            return {
+                'combined_score': round(combined_score, 2),
+                'title_score': title_result['title_score'],
+                'description_score': description_result['description_score'],
+                'title_breakdown': title_result['breakdown'],
+                'description_breakdown': description_result['breakdown'],
+                'issues': title_result['issues'] + description_result['issues'],
+                'suggestions': title_result['suggestions'] + description_result['suggestions'],
+                'ai_improvements': self._get_ai_improvements(product, title_result, description_result) if self.use_ai else None
+            }
+
+        except Exception as e:
+            logger.error(f"Title/Description scoring error: {e}", exc_info=True)
+            return {
+                'combined_score': 0.0,
+                'title_score': 0.0,
+                'description_score': 0.0,
+                'issues': [f"Scoring failed: {str(e)}"],
+                'suggestions': []
+            }
+
+    def _score_title(self, title: str, category: str, attributes: Dict) -> Dict:
+        """Score title quality (10% weight)"""
+        scores = {}
+        issues = []
+        suggestions = []
+
+        # 1. Length Optimization (25% of title score)
+        length_score, length_issues, length_suggestions = self._check_title_length(title)
+        scores['length_optimization'] = length_score
+        issues.extend(length_issues)
+        suggestions.extend(length_suggestions)
+
+        # 2. Brand Presence (25% of title score)
+        brand_score, brand_issues, brand_suggestions = self._check_brand_presence(title, category, attributes)
+        scores['brand_presence'] = brand_score
+        issues.extend(brand_issues)
+        suggestions.extend(brand_suggestions)
+
+        # 3. Keyword Inclusion (25% of title score)
+        keyword_score, keyword_issues, keyword_suggestions = self._check_title_keywords(title, attributes)
+        scores['keyword_inclusion'] = keyword_score
+        issues.extend(keyword_issues)
+        suggestions.extend(keyword_suggestions)
+
+        # 4. Readability (25% of title score)
+        readability_score, readability_issues, readability_suggestions = self._check_title_readability(title)
+        scores['readability'] = readability_score
+        issues.extend(readability_issues)
+        suggestions.extend(readability_suggestions)
+
+        # Calculate final title score
+        final_score = sum(scores[key] * self.title_weights[key] for key in scores)
+
+        return {
+            'title_score': round(final_score, 2),
+            'breakdown': scores,
+            'issues': issues,
+            'suggestions': suggestions
+        }
+
+    def _score_description(
+        self,
+        description: str,
+        title: str,
+        attributes: Dict,
+        category: str
+    ) -> Dict:
+        """Score description quality (20% weight)"""
+        scores = {}
+        issues = []
+        suggestions = []
+
+        # 1. Grammar & Spelling (25% of description score)
+        grammar_score, grammar_issues, grammar_suggestions = self._check_grammar_spelling(description)
+        scores['grammar_spelling'] = grammar_score
+        issues.extend(grammar_issues)
+        suggestions.extend(grammar_suggestions)
+
+        # 2. Duplication Detection (20% of description score)
+        duplication_score, dup_issues, dup_suggestions = self._check_duplication(description, title)
+        scores['duplication'] = duplication_score
+        issues.extend(dup_issues)
+        suggestions.extend(dup_suggestions)
+
+        # 3. Readability (20% of description score)
+        readability_score, read_issues, read_suggestions = self._check_description_readability(description)
+        scores['readability'] = readability_score
+        issues.extend(read_issues)
+        suggestions.extend(read_suggestions)
+
+        # 4. Completeness (20% of description score)
+        completeness_score, comp_issues, comp_suggestions = self._check_completeness(description, attributes, category)
+        scores['completeness'] = completeness_score
+        issues.extend(comp_issues)
+        suggestions.extend(comp_suggestions)
+
+        # 5. Structure (15% of description score)
+        structure_score, struct_issues, struct_suggestions = self._check_description_structure(description)
+        scores['structure'] = structure_score
+        issues.extend(struct_issues)
+        suggestions.extend(struct_suggestions)
+
+        # Calculate final description score
+        final_score = sum(scores[key] * self.description_weights[key] for key in scores)
+
+        return {
+            'description_score': round(final_score, 2),
+            'breakdown': scores,
+            'issues': issues,
+            'suggestions': suggestions
+        }
+
+    # ============== TITLE SCORING METHODS ==============
+
+    def _check_title_length(self, title: str) -> Tuple[float, List[str], List[str]]:
+        """Check optimal title length (50-100 characters)"""
+        issues = []
+        suggestions = []
+        length = len(title)
+
+        if length < 20:
+            issues.append(f"Title: Too short ({length} chars, minimum 20)")
+            suggestions.append("Expand title to 50-100 characters with key product details")
+            score = (length / 20) * 70  # scale up toward the 70 awarded at exactly 20 chars
+        elif length < 50:
+            suggestions.append("Consider expanding title to 50-100 characters for better SEO")
+            score = 70 + (length - 20) / 30 * 30  # 70-100 score
+        elif length <= 100:
+            score = 100.0
+        elif length <= 150:
+            suggestions.append("Title slightly long, consider shortening to 100 characters")
+            score = 90 - (length - 100) / 50 * 20  # 90-70 score
+        else:
+            issues.append(f"Title: Too long ({length} chars, maximum 150)")
+            suggestions.append("Shorten title to 50-100 characters, prioritize key features")
+            score = 50.0
+
+        return score, issues, suggestions
+
+    def _check_brand_presence(self, title: str, category: str, attributes: Dict) -> Tuple[float, List[str], List[str]]:
+        """Check if brand is present in title"""
+        issues = []
+        suggestions = []
+
+        # Get brand from attributes
+        brand_attr = attributes.get('brand', '')
+
+        if not brand_attr:
+            issues.append("Title: No brand found in attributes")
+            suggestions.append("Add brand attribute to product")
+            return 50.0, issues, suggestions
+
+        title_lower = title.lower()
+        brand_lower = str(brand_attr).lower()
+
+        # Check direct presence
+        if brand_lower in title_lower:
+            return 100.0, issues, suggestions
+
+        # Check if any common brand is present
+        category_brands = self.common_brands.get(category, [])
+        found_brand = any(brand.lower() in title_lower for brand in category_brands)
+
+        if found_brand:
+            return 80.0, issues, suggestions
+
+        # Use spaCy NER for brand detection
+        if self.nlp:
+            doc = self.nlp(title)
+            orgs = [ent.text.lower() for ent in doc.ents if ent.label_ == 'ORG']
+            if brand_lower in orgs or any(brand.lower() in orgs for brand in category_brands):
+                return 90.0, issues, suggestions
+
+        issues.append(f"Title: Brand '{brand_attr}' not clearly mentioned")
+        suggestions.append(f"Add brand name '{brand_attr}' to title start")
+        return 30.0, issues, suggestions
+
+    def _check_title_keywords(self, title: str, attributes: Dict) -> Tuple[float, List[str], List[str]]:
+        """Check presence of key attributes in title"""
+        issues = []
+        suggestions = []
+
+        key_attributes = ['brand', 'model', 'color', 'size', 'material']
+        present_count = 0
+        missing_attrs = []
+
+        title_lower = title.lower()
+
+        for attr in key_attributes:
+            value = attributes.get(attr)
+            if value and str(value).lower() in title_lower:
+                present_count += 1
+            elif value:
+                missing_attrs.append(f"{attr}: {value}")
+
+        if present_count == 0:
+            issues.append("Title: No key attributes found")
+            suggestions.append("Include at least 2-3 key attributes (brand, model, color)")
+            score = 20.0
+        elif present_count == 1:
+            suggestions.append(f"Consider adding more attributes: {', '.join(missing_attrs[:2])}")
+            score = 50.0
+        elif present_count == 2:
+            score = 75.0
+        else:
+            score = 100.0
+
+        return score, issues, suggestions
+
+    def _check_title_readability(self, title: str) -> Tuple[float, List[str], List[str]]:
+        """Check title readability and quality"""
+        issues = []
+        suggestions = []
+        score_components = []
+
+        # 1. Check for spam patterns (case handling is built into self.spam_patterns)
+        spam_found = any(re.search(pattern, title) for pattern in self.spam_patterns)
+        if spam_found:
+            issues.append("Title: Contains spam-like patterns (excessive caps, multiple punctuation)")
+            suggestions.append("Remove spam indicators, use professional language")
+            score_components.append(30.0)
+        else:
+            score_components.append(100.0)
+
+        # 2. Check capitalization
+        if title.isupper():
+            issues.append("Title: All uppercase (poor readability)")
+            suggestions.append("Use Title Case or Sentence case")
+            score_components.append(40.0)
+        elif title.islower():
+            issues.append("Title: All lowercase (unprofessional)")
+            suggestions.append("Use Title Case capitalization")
+            score_components.append(60.0)
+        else:
+            score_components.append(100.0)
+
+        # 3. Check word count (optimal: 8-15 words)
+        word_count = len(title.split())
+        if word_count < 5:
+            suggestions.append("Title too few words, expand with descriptive terms")
+            score_components.append(60.0)
+        elif word_count > 20:
+            suggestions.append("Title too wordy, focus on essential information")
+            score_components.append(70.0)
+        else:
+            score_components.append(100.0)
+
+        # 4. Check for numbers/symbols abuse
+        special_char_ratio = sum(not c.isalnum() and c != ' ' for c in title) / max(len(title), 1)
+        if special_char_ratio > 0.2:
+            issues.append("Title: Excessive special characters")
+            suggestions.append("Reduce special characters, focus on clear product description")
+            score_components.append(50.0)
+        else:
+            score_components.append(100.0)
+
+        final_score = np.mean(score_components)
+        return final_score, issues, suggestions
+
+    # ============== DESCRIPTION SCORING METHODS ==============
+
+    def _check_grammar_spelling(self, description: str) -> Tuple[float, List[str], List[str]]:
+        """Check grammar and spelling using TextBlob and LanguageTool"""
+        issues = []
+        suggestions = []
+
+        if not description or len(description.strip()) < 10:
+            issues.append("Description: Too short or empty")
+            suggestions.append("Write a detailed description (50-150 words)")
+            return 0.0, issues, suggestions
+
+        error_count = 0
+
+        # Method 1: LanguageTool (more accurate)
+        if self.grammar_tool:
+            try:
+                matches = self.grammar_tool.check(description)
+                error_count = len(matches)
+
+                if error_count > 10:
+                    issues.append(f"Description: {error_count} grammar/spelling errors found")
+                    suggestions.append("Review and correct grammar errors")
+                elif error_count > 5:
+                    suggestions.append(f"{error_count} minor grammar issues found, consider reviewing")
+
+            except Exception as e:
+                logger.warning(f"LanguageTool error: {e}")
+
+        # Method 2: TextBlob fallback
+        else:
+            try:
+                blob = TextBlob(description)
+                # Approximate spell check: count words whose spelling
+                # TextBlob's correct() would change
+                corrected = blob.correct()
+                misspelled = sum(
+                    1 for original, fixed in zip(blob.words, corrected.words)
+                    if original.lower() != fixed.lower()
+                )
+                error_count = misspelled
+
+                if error_count > 5:
+                    issues.append(f"Description: ~{error_count} potential spelling errors")
+                    suggestions.append("Run spell-check and correct misspellings")
+
+            except Exception as e:
+                logger.warning(f"TextBlob error: {e}")
+                return 80.0, issues, suggestions  # Default score if both fail
+
+        # Calculate score
+        word_count = len(description.split())
+        error_ratio = error_count / max(word_count, 1)
+
+        if error_ratio == 0:
+            score = 100.0
+        elif error_ratio < 0.02:  # < 2% errors
+            score = 95.0
+        elif error_ratio < 0.05:  # < 5% errors
+            score = 85.0
+        elif error_ratio < 0.10:  # < 10% errors
+            score = 70.0
+        else:
+            score = 50.0
+
+        return score, issues, suggestions
+
+    def _check_duplication(self, description: str, title: str) -> Tuple[float, List[str], List[str]]:
+        """Check for duplicated content and repetitive sentences"""
+        issues = []
+        suggestions = []
+
+        if not description or len(description.strip()) < 20:
+            return 100.0, issues, suggestions
+
+        # 1. Check title duplication in description
+        title_words = set(title.lower().split())
+        desc_words = description.lower().split()
+        desc_word_set = set(desc_words)
+
+        overlap = len(title_words & desc_word_set) / len(title_words) if title_words else 0
+
+        if overlap > 0.8:
+            issues.append("Description: Mostly duplicates title content")
+            suggestions.append("Expand description with unique details not in title")
+            duplication_score = 40.0
+        elif overlap > 0.6:
+            suggestions.append("Description has significant overlap with title, add unique information")
+            duplication_score = 70.0
+        else:
+            duplication_score = 100.0
+
+        # 2. Check internal repetition (sentence similarity)
+        sentences = re.split(r'[.!?]+', description)
+        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
+
+        if len(sentences) > 1 and self.sentence_model:
+            try:
+                embeddings = self.sentence_model.encode(sentences)
+
+                # Calculate cosine similarity between all pairs
+                from sklearn.metrics.pairwise import cosine_similarity
+                similarity_matrix = cosine_similarity(embeddings)
+
+                # Count high-similarity pairs (excluding diagonal)
+                high_similarity_count = 0
+                for i in range(len(similarity_matrix)):
+                    for j in range(i + 1, len(similarity_matrix)):
+                        if similarity_matrix[i][j] > 0.85:  # Very similar
+                            high_similarity_count += 1
+
+                if high_similarity_count > 2:
+                    issues.append("Description: Contains repetitive/duplicate sentences")
+                    suggestions.append("Remove duplicate sentences, provide varied information")
+                    repetition_score = 50.0
+                elif high_similarity_count > 0:
+                    suggestions.append("Some sentences are similar, consider diversifying content")
+                    repetition_score = 75.0
+                else:
+                    repetition_score = 100.0
+
+            except Exception as e:
+                logger.warning(f"Sentence similarity error: {e}")
+                repetition_score = 80.0
+        else:
+            # Fallback: Check for repeated phrases
+            words = desc_words
+            bigrams = [' '.join(words[i:i+2]) for i in range(len(words)-1)]
+            trigrams = [' '.join(words[i:i+3]) for i in range(len(words)-2)]
+
+            bigram_counts = Counter(bigrams)
+            trigram_counts = Counter(trigrams)
+
+            repeated_bigrams = sum(1 for count in bigram_counts.values() if count > 2)
+            repeated_trigrams = sum(1 for count in trigram_counts.values() if count > 1)
+
+            if repeated_trigrams > 3 or repeated_bigrams > 10:
+                issues.append("Description: Contains repeated phrases")
+                suggestions.append("Reduce repetitive phrasing, use varied vocabulary")
+                repetition_score = 60.0
+            else:
+                repetition_score = 90.0
+
+        final_score = (duplication_score * 0.5 + repetition_score * 0.5)
+        return final_score, issues, suggestions
+
+    def _check_description_readability(self, description: str) -> Tuple[float, List[str], List[str]]:
+        """Check description readability using Flesch Reading Ease"""
+        issues = []
+        suggestions = []
+
+        if not description or len(description.strip()) < 20:
+            issues.append("Description: Too short to evaluate readability")
+            return 50.0, issues, suggestions
+
+        try:
+            blob = TextBlob(description)
+
+            # Calculate Flesch Reading Ease
+            sentences = len(blob.sentences)
+            words = len(blob.words)
+            syllables = sum(self._count_syllables(str(word)) for word in blob.words)
+
+            if sentences == 0 or words == 0:
+                return 70.0, issues, suggestions
+
+            flesch_score = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
+
+            # Interpret Flesch score (0-100, higher is easier)
+            if flesch_score >= 60:  # Easy to read
+                readability_score = 100.0
+            elif flesch_score >= 50:  # Fairly easy
+                readability_score = 85.0
+            elif flesch_score >= 30:  # Difficult
+                suggestions.append("Description readability is moderate, simplify complex sentences")
+                readability_score = 70.0
+            else:  # Very difficult
+                issues.append("Description: Very difficult to read (complex sentences)")
+                suggestions.append("Simplify language, use shorter sentences and common words")
+                readability_score = 50.0
+
+            # Check average sentence length
+            avg_sentence_length = words / sentences
+            if avg_sentence_length > 25:
+                issues.append("Description: Sentences too long (reduce complexity)")
+                suggestions.append("Break long sentences into shorter ones (aim for 15-20 words)")
+                readability_score *= 0.9
+
+            return readability_score, issues, suggestions
+
+        except Exception as e:
+            logger.warning(f"Readability check error: {e}")
+            return 70.0, issues, suggestions
+
+    def _count_syllables(self, word: str) -> int:
+        """Count syllables in a word (simple approximation)"""
+        word = word.lower()
+        vowels = "aeiouy"
+        syllable_count = 0
+        previous_was_vowel = False
+
+        for char in word:
+            is_vowel = char in vowels
+            if is_vowel and not previous_was_vowel:
+                syllable_count += 1
+            previous_was_vowel = is_vowel
+
+        # Adjust for silent e
+        if word.endswith('e'):
+            syllable_count -= 1
+
+        # Ensure at least 1 syllable
+        if syllable_count == 0:
+            syllable_count = 1
+
+        return syllable_count
+
+    def _check_completeness(self, description: str, attributes: Dict, category: str) -> Tuple[float, List[str], List[str]]:
+        """Check if description covers essential product information"""
+        issues = []
+        suggestions = []
+
+        if not description or len(description.strip()) < 20:
+            issues.append("Description: Too short to be complete")
+            suggestions.append("Write comprehensive description covering features, benefits, specifications")
+            return 20.0, issues, suggestions
+
+        desc_lower = description.lower()
+
+        # Essential elements to check
+        essential_elements = {
+            'features': ['feature', 'includes', 'has', 'offers', 'provides', 'equipped'],
+            'benefits': ['benefit', 'advantage', 'helps', 'improves', 'enhances', 'perfect for'],
+            'specifications': ['specification', 'spec', 'dimension', 'weight', 'size', 'capacity'],
+            'use_case': ['use', 'ideal', 'suitable', 'designed for', 'great for', 'perfect for']
+        }
+
+        covered_elements = 0
+        missing_elements = []
+
+        for element, keywords in essential_elements.items():
+            if any(keyword in desc_lower for keyword in keywords):
+                covered_elements += 1
+            else:
+                missing_elements.append(element)
+
+        # Check attribute coverage (only over key attributes the product actually has)
+        key_attrs = ['brand', 'model', 'color', 'size', 'material', 'warranty']
+        present_key_attrs = [attr for attr in key_attrs if attr in attributes]
+        attrs_in_desc = sum(1 for attr in present_key_attrs if str(attributes[attr]).lower() in desc_lower)
+
+        attr_coverage_score = (attrs_in_desc / len(present_key_attrs)) * 100 if present_key_attrs else 50.0
+        element_coverage_score = (covered_elements / len(essential_elements)) * 100
+
+        final_score = (attr_coverage_score * 0.4 + element_coverage_score * 0.6)
+
+        if covered_elements < 2:
+            issues.append(f"Description: Incomplete (missing: {', '.join(missing_elements)})")
+            suggestions.append("Add features, benefits, specifications, and use cases")
+        elif covered_elements < 3:
+            suggestions.append(f"Consider adding: {', '.join(missing_elements[:2])}")
+
+        if attrs_in_desc < 2 and len(attributes) > 2:
+            suggestions.append("Include more product attributes in description")
+
+        return final_score, issues, suggestions
+
+    def _check_description_structure(self, description: str) -> Tuple[float, List[str], List[str]]:
+        """Check description structure and formatting"""
+        issues = []
+        suggestions = []
+
+        if not description or len(description.strip()) < 20:
+            return 50.0, issues, suggestions
+
+        score_components = []
+
+        # 1. Check for proper sentences (not just bullet points)
+        sentences = re.split(r'[.!?]+', description)
+        complete_sentences = [s for s in sentences if len(s.split()) >= 5]
+
+        if len(complete_sentences) < 2:
+            issues.append("Description: Lacks proper sentence structure")
+            suggestions.append("Write in complete sentences, not just bullet points")
+            score_components.append(40.0)
+        else:
+            score_components.append(100.0)
+
+        # 2. Check for paragraph breaks (if long)
+        if len(description) > 300:
+            paragraph_breaks = description.count('\n\n') + description.count('\n')
+            if paragraph_breaks < 1:
+                suggestions.append("Break long description into paragraphs for readability")
+                score_components.append(70.0)
+            else:
+                score_components.append(100.0)
+        else:
+            score_components.append(100.0)
+
+        # 3. Check opening sentence quality
+        first_sentence = sentences[0].strip() if sentences else ""
+        if len(first_sentence.split()) < 5:
+            issues.append("Description: Weak opening sentence")
+            suggestions.append("Start with a strong, descriptive opening sentence")
+            score_components.append(60.0)
+        else:
+            score_components.append(100.0)
+
+        # 4. Check for call-to-action or conclusion
+        cta_keywords = ['order', 'buy', 'get', 'shop', 'add to cart', 'perfect', 'ideal', 'must-have']
+        has_cta = any(keyword in description.lower() for keyword in cta_keywords)
+
+        if not has_cta and len(description.split()) > 30:
+            suggestions.append("Consider adding a subtle call-to-action or conclusion")
+            score_components.append(85.0)
+        else:
+            score_components.append(100.0)
+
+        final_score = np.mean(score_components)
+        return final_score, issues, suggestions
+
+    def _get_ai_improvements(self, product: Dict, title_result: Dict, description_result: Dict) -> Dict:
+        """Use Gemini AI to generate improved title and description"""
+        if not self.use_ai or not self.ai_service:
+            return None
+
+        try:
+            # Combine all issues
+            all_issues = title_result['issues'] + description_result['issues']
+
+            if not all_issues:
+                return {"note": "No improvements needed"}
+
+            prompt = f"""Improve this product listing's title and description.
+
+CURRENT:
+Title: {product.get('title', '')}
+Description: {product.get('description', '')}
+Category: {product.get('category', '')}
+Attributes: {product.get('attributes', {})}
+
+ISSUES FOUND:
+{chr(10).join(f"• {issue}" for issue in all_issues[:10])}
+
+Return ONLY this JSON:
+{{
+    "improved_title": "optimized title 50-100 chars",
+    "improved_description": "enhanced description 50-150 words",
+    "changes_made": ["change1", "change2"],
+    "confidence": "high/medium/low"
+}}"""
+
+            response = self.ai_service._call_gemini_api(prompt, max_tokens=2048)
+
+            if response and response.candidates:
+                return self.ai_service._parse_response(response.text)
+
+            return {"error": "No AI response"}
+
+        except Exception as e:
+            logger.error(f"AI improvement error: {e}")
+            return {"error": str(e)}
+
+
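+# Example usage (illustrative sketch only; the product values below are
+# invented, and the fields simply mirror the .get() calls made in
+# score_title_and_description()):
+#
+#     scorer = TitleDescriptionScorer(use_ai=False)
+#     result = scorer.score_title_and_description(
+#         product={
+#             "title": "Acme X200 Wireless Headphones, Black, 30-Hour Battery",
+#             "description": "The Acme X200 offers 30 hours of battery life...",
+#             "category": "Electronics",
+#             "attributes": {"brand": "Acme", "color": "Black", "model": "X200"},
+#         },
+#         category_rules=[],
+#     )
+#     print(result["combined_score"], result["issues"])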