Harshit Pathak, 15 hours ago
Commit
f540386406
42 files changed: 3,579 additions, 0 deletions
  1. +186 -0   architecture.txt
  2. +0 -0     content_quality_tool/__init__.py
  3. BIN       content_quality_tool/__pycache__/__init__.cpython-313.pyc
  4. BIN       content_quality_tool/__pycache__/settings.cpython-313.pyc
  5. BIN       content_quality_tool/__pycache__/urls.cpython-313.pyc
  6. BIN       content_quality_tool/__pycache__/wsgi.cpython-313.pyc
  7. +16 -0    content_quality_tool/asgi.py
  8. +130 -0   content_quality_tool/settings.py
  9. +25 -0    content_quality_tool/urls.py
  10. +16 -0   content_quality_tool/wsgi.py
  11. +0 -0    core/__init__.py
  12. BIN      core/__pycache__/__init__.cpython-313.pyc
  13. BIN      core/__pycache__/admin.cpython-313.pyc
  14. BIN      core/__pycache__/apps.cpython-313.pyc
  15. BIN      core/__pycache__/models.cpython-313.pyc
  16. BIN      core/__pycache__/urls.cpython-313.pyc
  17. BIN      core/__pycache__/views.cpython-313.pyc
  18. +36 -0   core/admin.py
  19. +6 -0    core/apps.py
  20. BIN      core/management/commands/__pycache__/load_sample_data.cpython-313.pyc
  21. +34 -0   core/management/commands/load_sample_data.py
  22. +55 -0   core/migrations/0001_initial.py
  23. +59 -0   core/migrations/0002_attributescore_ai_suggestions_and_more.py
  24. +0 -0    core/migrations/__init__.py
  25. BIN      core/migrations/__pycache__/0001_initial.cpython-313.pyc
  26. BIN      core/migrations/__pycache__/0002_attributescore_ai_suggestions_and_more.cpython-313.pyc
  27. BIN      core/migrations/__pycache__/__init__.cpython-313.pyc
  28. +64 -0   core/models.py
  29. BIN      core/services/__pycache__/attribute_scorer.cpython-313.pyc
  30. BIN      core/services/__pycache__/gemini_service.cpython-313.pyc
  31. BIN      core/services/__pycache__/seo_scorer.cpython-313.pyc
  32. +828 -0  core/services/attribute_scorer.py
  33. +1084 -0 core/services/gemini_service.py
  34. +435 -0  core/services/seo_scorer.py
  35. +3 -0    core/tests.py
  36. +11 -0   core/urls.py
  37. +404 -0  core/views.py
  38. BIN      data/__pycache__/sample_data.cpython-313.pyc
  39. +137 -0  data/sample_data.py
  40. BIN      db.sqlite3
  41. +22 -0   manage.py
  42. +28 -0   requirements.txt

+ 186 - 0
architecture.txt

@@ -0,0 +1,186 @@
+         ┌─────────────────────┐
+         │   Incoming Product  │
+         │   (via API POST)    │
+         └─────────┬───────────┘
+                   │
+                   ▼
+      ┌───────────────────────────┐
+      │  Validate SKU & Category  │
+      └─────────┬─────────────────┘
+                │
+                ▼
+       ┌────────────────────────┐
+       │  Fetch/Create Product  │
+       │  from Database         │
+       └─────────┬──────────────┘
+                 │
+                 ▼
+      ┌─────────────────────────────┐
+      │  Get Category Rules (Cache) │
+      └─────────┬───────────────────┘
+                │
+                ▼
+     ┌──────────────────────────────┐
+     │  AttributeQualityScorer      │
+     │  (score_product method)      │
+     └─────────┬────────────────────┘
+               │
+               ▼
+ ┌────────────────────────────────────────┐
+ │  Step 1: Check Mandatory Fields        │
+ │  Step 2: Check Standardization         │
+ │  Step 3: Check Missing Values          │
+ │  Step 4: Check Consistency             │
+ └─────────┬──────────────────────────────┘
+           │
+           ▼
+ ┌────────────────────────────────────────┐
+ │ Calculate Weighted Final Score         │
+ │  - mandatory_fields * 0.4              │
+ │  - standardization * 0.3               │
+ │  - missing_values * 0.2                │
+ │  - consistency * 0.1                   │
+ └─────────┬──────────────────────────────┘
+           │
+           ▼
+ ┌────────────────────────────────────────┐
+ │  Generate AI Suggestions (Optional)    │
+ │  - Uses Gemini service                 │
+ │  - Suggest fixes for issues            │
+ └─────────┬──────────────────────────────┘
+           │
+           ▼
+ ┌────────────────────────────────────────┐
+ │  Save AttributeScore in Database       │
+ │  - final_score, breakdown, issues      │
+ │  - suggestions, ai_suggestions         │
+ └─────────┬──────────────────────────────┘
+           │
+           ▼
+ ┌────────────────────────────────────────┐
+ │      Return JSON Response to Client    │
+ │  {success, product_sku, score_result}  │
+ └────────────────────────────────────────┘
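+
+The final score is a weighted sum of the four sub-scores, each on a
+0-100 scale. A minimal sketch of the roll-up (the sub-score values
+here are illustrative only):
+
+    weights = {"mandatory_fields": 0.4, "standardization": 0.3,
+               "missing_values": 0.2, "consistency": 0.1}
+    scores = {"mandatory_fields": 75.0, "standardization": 100.0,
+              "missing_values": 50.0, "consistency": 100.0}
+    final_score = sum(scores[k] * weights[k] for k in weights)  # -> 80.0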
+
+
+
+
+
+
+
+
+       ┌─────────────────────┐
+       │ Product Description │
+       └─────────┬───────────┘
+                 │
+                 ▼
+          ┌─────────────┐
+          │  spaCy NER  │
+          │ Extract:    │
+          │ - Brand     │
+          │ - Size      │
+          │ - Product   │
+          └─────┬───────┘
+                │
+                ▼
+        ┌────────────────────┐
+        │ AI Extraction      │
+        │ (Gemini Service)   │
+        └─────┬──────────────┘
+              │
+              ▼
+       ┌───────────────────┐
+       │ Return Attributes │
+       │ as Dict           │
+       └───────────────────┘
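+
+A condensed sketch of this path (it mirrors the logic in
+core/services/attribute_scorer.py; the Gemini fallback only fires when
+spaCy extracts fewer than three attributes):
+
+    import spacy
+
+    nlp = spacy.load("en_core_web_sm")
+
+    def extract(text: str) -> dict:
+        doc = nlp(text)
+        out = {}
+        for ent in doc.ents:
+            if ent.label_ == "ORG":            # brand names tag as ORG
+                out.setdefault("brand", ent.text)
+            elif ent.label_ == "QUANTITY":
+                out.setdefault("size", ent.text)
+            elif ent.label_ == "PRODUCT":
+                out.setdefault("product_type", ent.text)
+        return out  # GeminiAttributeService fills the gaps if len(out) < 3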
+
+
+
+
+
+
+FOR SEO:
+
+A hybrid approach combining KeyBERT for keyword extraction,
+sentence-transformers for semantic analysis,
+and the existing Gemini API for intelligent SEO suggestions.
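+
+A minimal sketch of the two open-source pieces (the model name
+"all-MiniLM-L6-v2" and the example inputs are assumptions; the actual
+wiring lives in core/services/seo_scorer.py):
+
+    from keybert import KeyBERT
+    from sentence_transformers import SentenceTransformer, util
+
+    title = "Acme Wireless Mouse, Ergonomic, 2.4 GHz"  # example inputs
+    description = "An ergonomic wireless mouse from Acme with a 2.4 GHz receiver."
+
+    kw_model = KeyBERT(model="all-MiniLM-L6-v2")
+    st_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+    # Top keyphrases from the description, as (phrase, relevance) pairs
+    keywords = kw_model.extract_keywords(
+        description, keyphrase_ngram_range=(1, 2), stop_words="english", top_n=5
+    )
+
+    # Semantic similarity between title and description
+    title_emb, desc_emb = st_model.encode([title, description], convert_to_tensor=True)
+    similarity = util.cos_sim(title_emb, desc_emb).item()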
+
+
+
+
+
+# SEO & Discoverability Implementation Summary
+
+## 📋 What Was Implemented
+
+### Core Feature: SEO & Discoverability Scoring (15% weight)
+
+A comprehensive SEO scoring system that evaluates product listings for search engine optimization and customer discoverability across 4 key dimensions:
+
+| Dimension | Weight | What It Checks |
+|-----------|--------|----------------|
+| **Keyword Coverage** | 35% | Are mandatory attributes mentioned in title/description? |
+| **Semantic Richness** | 30% | Description quality, vocabulary diversity, descriptive language |
+| **Backend Keywords** | 20% | Presence of high-value search terms and category keywords |
+| **Title Optimization** | 15% | Title length (50-100 chars), structure, no keyword stuffing |
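+
+A crude sketch of the title-optimization rule (the stuffing heuristic
+below is an assumption; the real checks live in seo_scorer.py):
+
+```python
+def title_issues(title: str) -> list:
+    issues = []
+    if not 50 <= len(title) <= 100:
+        issues.append(f"Title length {len(title)} outside 50-100 chars")
+    words = title.lower().split()
+    for w in set(words):            # naive stuffing check: any word
+        if words.count(w) > 2:      # repeated more than twice is flagged
+            issues.append(f"Possible keyword stuffing: '{w}'")
+    return issues
+```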
+
+## 🎯 Why This Approach?
+
+### Technology Stack Chosen
+
+| Technology | Purpose | Why This Choice |
+|------------|---------|-----------------|
+| **KeyBERT** | Keyword extraction | Fast, accurate, open-source. Best for e-commerce SEO |
+| **Sentence-Transformers** | Semantic similarity | Lightweight, pre-trained models. Better than full LLMs |
+| **Google Gemini** | AI suggestions | Already in your stack. Provides context-aware recommendations |
+| **spaCy** | NLP preprocessing | Fast entity recognition, existing in your code |
+| **RapidFuzz** | Fuzzy matching | Existing dependency, handles typos well |
+
+### Alternatives Considered & Rejected
+
+❌ **OpenAI GPT** - Too expensive ($0.02/1k tokens), slower, overkill for this use case  
+❌ **SEMrush/Ahrefs** - $100-500/month, external API, limited customization  
+❌ **LLaMA 2** - Requires GPU, complex setup, slower inference  
+❌ **Full BERT models** - Too heavy, KeyBERT uses lighter sentence transformers  
+
+## 📊 Integration Architecture
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                     API Request (views.py)                   │
+└───────────────────────────┬─────────────────────────────────┘
+                            │
+                            ▼
+┌─────────────────────────────────────────────────────────────┐
+│          AttributeQualityScorer (attribute_scorer.py)        │
+│  ┌──────────────────────────────────────────────────────┐   │
+│  │ Mandatory Fields (34%)                                │   │
+│  │ Standardization (26%)                                 │   │
+│  │ Missing Values (17%)                                  │   │
+│  │ Consistency (8%)                                      │   │
+│  │ ┌────────────────────────────────────────────────┐   │   │
+│  │ │ SEO & Discoverability (15%) ← NEW              │   │   │
+│  │ │  ├─ Keyword Coverage (35%)                      │   │   │
+│  │ │  ├─ Semantic Richness (30%)                     │   │   │
+│  │ │  ├─ Backend Keywords (20%)                      │   │   │
+│  │ │  └─ Title Optimization (15%)                    │   │   │
+│  │ └────────────────────────────────────────────────┘   │   │
+│  └──────────────────────────────────────────────────────┘   │
+└───────────────────────────┬─────────────────────────────────┘
+                            │
+                            ├──────────────────┐
+                            │                  │
+                            ▼                  ▼
+              ┌───────────────────┐  ┌──────────────────┐
+              │  SEOScorer        │  │ GeminiService    │
+              │  (seo_scorer.py)  │  │ (AI Suggestions) │
+              │                   │  │                  │
+              │ ├─ KeyBERT        │  │ Enhanced with    │
+              │ ├─ SentenceModel  │  │ SEO awareness    │
+              │ └─ NLP Analysis   │  │                  │
+              └───────────────────┘  └──────────────────┘
+                            │
+                            ▼
+                    ┌────────────────┐
+                    │  JSON Response │
+                    │  with SEO data │
+                    └────────────────┘
+```
+ 0 - 0
content_quality_tool/__init__.py


BIN
content_quality_tool/__pycache__/__init__.cpython-313.pyc


BIN
content_quality_tool/__pycache__/settings.cpython-313.pyc


BIN
content_quality_tool/__pycache__/urls.cpython-313.pyc


BIN
content_quality_tool/__pycache__/wsgi.cpython-313.pyc


+ 16 - 0
content_quality_tool/asgi.py

@@ -0,0 +1,16 @@
+"""
+ASGI config for content_quality_tool project.
+
+It exposes the ASGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/5.2/howto/deployment/asgi/
+"""
+
+import os
+
+from django.core.asgi import get_asgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'content_quality_tool.settings')
+
+application = get_asgi_application()

+ 130 - 0
content_quality_tool/settings.py

@@ -0,0 +1,130 @@
+"""
+Django settings for content_quality_tool project.
+
+Generated by 'django-admin startproject' using Django 5.2.7.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/5.2/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/5.2/ref/settings/
+"""
+
+from pathlib import Path
+import os
+
+
+# Build paths inside the project like this: BASE_DIR / 'subdir'.
+BASE_DIR = Path(__file__).resolve().parent.parent
+
+
+# Quick-start development settings - unsuitable for production
+# See https://docs.djangoproject.com/en/5.2/howto/deployment/checklist/
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = 'django-insecure-$6far8v=798or1wru24=zq&k*9&frm+dk%c!*w!a4wfb#z1_+3'
+
+# SECURITY WARNING: don't run with debug turned on in production!
+DEBUG = True
+
+ALLOWED_HOSTS = []
+
+
+# Application definition
+
+INSTALLED_APPS = [
+    'django.contrib.admin',
+    'django.contrib.auth',
+    'django.contrib.contenttypes',
+    'django.contrib.sessions',
+    'django.contrib.messages',
+    'django.contrib.staticfiles',
+    'core',
+]
+
+MIDDLEWARE = [
+    'django.middleware.security.SecurityMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.common.CommonMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+ROOT_URLCONF = 'content_quality_tool.urls'
+
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [],
+        'APP_DIRS': True,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]
+
+WSGI_APPLICATION = 'content_quality_tool.wsgi.application'
+
+
+# Database
+# https://docs.djangoproject.com/en/5.2/ref/settings/#databases
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.sqlite3',
+        'NAME': BASE_DIR / 'db.sqlite3',
+    }
+}
+
+
+# Password validation
+# https://docs.djangoproject.com/en/5.2/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+    {
+        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+    },
+]
+
+
+# Internationalization
+# https://docs.djangoproject.com/en/5.2/topics/i18n/
+
+LANGUAGE_CODE = 'en-us'
+
+TIME_ZONE = 'UTC'
+
+USE_I18N = True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/5.2/howto/static-files/
+
+STATIC_URL = 'static/'
+
+# Default primary key field type
+# https://docs.djangoproject.com/en/5.2/ref/settings/#default-auto-field
+
+DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
+
+
+# Gemini API Configuration
+# SECURITY WARNING: never hardcode the key; set it via the environment.
+GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', '')
+

+ 25 - 0
content_quality_tool/urls.py

@@ -0,0 +1,25 @@
+"""
+URL configuration for content_quality_tool project.
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/5.2/topics/http/urls/
+Examples:
+Function views
+    1. Add an import:  from my_app import views
+    2. Add a URL to urlpatterns:  path('', views.home, name='home')
+Class-based views
+    1. Add an import:  from other_app.views import Home
+    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
+Including another URLconf
+    1. Import the include() function: from django.urls import include, path
+    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
+"""
+from django.contrib import admin
+from django.urls import path, include
+
+urlpatterns = [
+    path('admin/', admin.site.urls),
+    path("core/", include("core.urls")),
+]
+

+ 16 - 0
content_quality_tool/wsgi.py

@@ -0,0 +1,16 @@
+"""
+WSGI config for content_quality_tool project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/5.2/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'content_quality_tool.settings')
+
+application = get_wsgi_application()

+ 0 - 0
core/__init__.py


BIN
core/__pycache__/__init__.cpython-313.pyc


BIN
core/__pycache__/admin.cpython-313.pyc


BIN
core/__pycache__/apps.cpython-313.pyc


BIN
core/__pycache__/models.cpython-313.pyc


BIN
core/__pycache__/urls.cpython-313.pyc


BIN
core/__pycache__/views.cpython-313.pyc


+ 36 - 0
core/admin.py

@@ -0,0 +1,36 @@
+# admin.py
+from django.contrib import admin
+from .models import Product, AttributeScore, CategoryAttributeRule
+
+# -------------------------
+# Product Admin
+# -------------------------
+@admin.register(Product)
+class ProductAdmin(admin.ModelAdmin):
+    list_display = ('sku', 'title', 'category', 'created_at', 'updated_at')
+    search_fields = ('sku', 'title', 'category')
+    list_filter = ('category', 'created_at', 'updated_at')
+    readonly_fields = ('created_at', 'updated_at')
+    ordering = ('-created_at',)
+
+# -------------------------
+# AttributeScore Admin
+# -------------------------
+@admin.register(AttributeScore)
+class AttributeScoreAdmin(admin.ModelAdmin):
+    list_display = ('product', 'score', 'max_score', 'processing_time', 'created_at')
+    search_fields = ('product__sku', 'product__title')
+    list_filter = ('created_at',)
+    readonly_fields = ('created_at',)
+    ordering = ('-created_at',)
+    autocomplete_fields = ('product',)  # makes it easier to select products in large DB
+
+# -------------------------
+# CategoryAttributeRule Admin
+# -------------------------
+@admin.register(CategoryAttributeRule)
+class CategoryAttributeRuleAdmin(admin.ModelAdmin):
+    list_display = ('category', 'attribute_name', 'is_mandatory', 'data_type')
+    search_fields = ('category', 'attribute_name')
+    list_filter = ('category', 'is_mandatory', 'data_type')
+    ordering = ('category', 'attribute_name')

+ 6 - 0
core/apps.py

@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class CoreConfig(AppConfig):
+    default_auto_field = 'django.db.models.BigAutoField'
+    name = 'core'

BIN
core/management/commands/__pycache__/load_sample_data.cpython-313.pyc


+ 34 - 0
core/management/commands/load_sample_data.py

@@ -0,0 +1,34 @@
+
+# management/commands/load_sample_data.py
+"""
+Django management command to load sample data
+Run: python manage.py load_sample_data
+"""
+from django.core.management.base import BaseCommand
+from core.models import Product, CategoryAttributeRule
+from data.sample_data import SAMPLE_CATEGORY_RULES, SAMPLE_PRODUCTS
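+# Assumed shapes (illustrative; the actual values live in data/sample_data.py):
+#   SAMPLE_CATEGORY_RULES: list of dicts matching CategoryAttributeRule fields,
+#     e.g. {"category": "Electronics", "attribute_name": "brand",
+#           "is_mandatory": True, "valid_values": ["Acme", "Globex"]}
+#   SAMPLE_PRODUCTS: list of dicts matching Product fields,
+#     e.g. {"sku": "SKU-001", "category": "Electronics", "title": "...",
+#           "attributes": {"brand": "Acme"}}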
+
+class Command(BaseCommand):
+    help = 'Load sample data for attribute quality scoring'
+    
+    def handle(self, *args, **kwargs):
+        self.stdout.write('Loading sample category rules...')
+        
+        # Clear existing rules
+        CategoryAttributeRule.objects.all().delete()
+        
+        # Load rules
+        for rule in SAMPLE_CATEGORY_RULES:
+            CategoryAttributeRule.objects.create(**rule)
+        
+        self.stdout.write(self.style.SUCCESS(f'Loaded {len(SAMPLE_CATEGORY_RULES)} category rules'))
+        
+        # Load products
+        self.stdout.write('Loading sample products...')
+        Product.objects.all().delete()
+        
+        for prod in SAMPLE_PRODUCTS:
+            Product.objects.create(**prod)
+        
+        self.stdout.write(self.style.SUCCESS(f'Loaded {len(SAMPLE_PRODUCTS)} products'))
+        self.stdout.write(self.style.SUCCESS('Sample data loaded successfully!'))

+ 55 - 0
core/migrations/0001_initial.py

@@ -0,0 +1,55 @@
+# Generated by Django 5.2.7 on 2025-10-07 16:45
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Product',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('sku', models.CharField(max_length=100, unique=True)),
+                ('category', models.CharField(max_length=100)),
+                ('title', models.TextField()),
+                ('description', models.TextField(blank=True)),
+                ('attributes', models.JSONField(default=dict)),
+                ('created_at', models.DateTimeField(auto_now_add=True)),
+                ('updated_at', models.DateTimeField(auto_now=True)),
+            ],
+        ),
+        migrations.CreateModel(
+            name='CategoryAttributeRule',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('category', models.CharField(max_length=100)),
+                ('attribute_name', models.CharField(max_length=100)),
+                ('is_mandatory', models.BooleanField(default=False)),
+                ('valid_values', models.JSONField(blank=True, default=list)),
+                ('data_type', models.CharField(default='string', max_length=50)),
+            ],
+            options={
+                'unique_together': {('category', 'attribute_name')},
+            },
+        ),
+        migrations.CreateModel(
+            name='AttributeScore',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('score', models.FloatField()),
+                ('max_score', models.FloatField(default=100.0)),
+                ('details', models.JSONField(default=dict)),
+                ('issues', models.JSONField(default=list)),
+                ('suggestions', models.JSONField(default=list)),
+                ('created_at', models.DateTimeField(auto_now_add=True)),
+                ('product', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='attribute_scores', to='core.product')),
+            ],
+        ),
+    ]

+ 59 - 0
core/migrations/0002_attributescore_ai_suggestions_and_more.py

@@ -0,0 +1,59 @@
+# Generated by Django 5.2.7 on 2025-10-07 16:58
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='attributescore',
+            name='ai_suggestions',
+            field=models.JSONField(blank=True, default=dict),
+        ),
+        migrations.AddField(
+            model_name='attributescore',
+            name='processing_time',
+            field=models.FloatField(blank=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='categoryattributerule',
+            name='description',
+            field=models.TextField(blank=True),
+        ),
+        migrations.AddField(
+            model_name='categoryattributerule',
+            name='max_length',
+            field=models.IntegerField(blank=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='categoryattributerule',
+            name='min_length',
+            field=models.IntegerField(blank=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='categoryattributerule',
+            name='validation_regex',
+            field=models.CharField(blank=True, max_length=500),
+        ),
+        migrations.AddIndex(
+            model_name='attributescore',
+            index=models.Index(fields=['-created_at'], name='core_attrib_created_599e4a_idx'),
+        ),
+        migrations.AddIndex(
+            model_name='categoryattributerule',
+            index=models.Index(fields=['category'], name='core_catego_categor_102115_idx'),
+        ),
+        migrations.AddIndex(
+            model_name='product',
+            index=models.Index(fields=['category'], name='core_produc_categor_ba410e_idx'),
+        ),
+        migrations.AddIndex(
+            model_name='product',
+            index=models.Index(fields=['sku'], name='core_produc_sku_a1eae6_idx'),
+        ),
+    ]

+ 0 - 0
core/migrations/__init__.py


BIN
core/migrations/__pycache__/0001_initial.cpython-313.pyc


BIN
core/migrations/__pycache__/0002_attributescore_ai_suggestions_and_more.cpython-313.pyc


BIN
core/migrations/__pycache__/__init__.cpython-313.pyc


+ 64 - 0
core/models.py

@@ -0,0 +1,64 @@
+# models.py
+from django.db import models
+
+class Product(models.Model):
+    """Product model to store basic product information"""
+    sku = models.CharField(max_length=100, unique=True)
+    category = models.CharField(max_length=100)
+    title = models.TextField()
+    description = models.TextField(blank=True)
+    attributes = models.JSONField(default=dict)
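+    # Free-form attribute map keyed by attribute name,
+    # e.g. {"brand": "Acme", "color": "Red"} (illustrative values)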
+    created_at = models.DateTimeField(auto_now_add=True)
+    updated_at = models.DateTimeField(auto_now=True)
+    
+    class Meta:
+        indexes = [
+            models.Index(fields=['category']),
+            models.Index(fields=['sku']),
+        ]
+    
+    def __str__(self):
+        return f"{self.sku} - {self.title}"
+
+class AttributeScore(models.Model):
+    """Store attribute quality scores"""
+    product = models.ForeignKey(Product, on_delete=models.CASCADE, related_name='attribute_scores')
+    score = models.FloatField()
+    max_score = models.FloatField(default=100.0)
+    details = models.JSONField(default=dict)
+    issues = models.JSONField(default=list)
+    suggestions = models.JSONField(default=list)
+    ai_suggestions = models.JSONField(default=dict, blank=True)  # Gemini AI suggestions
+    processing_time = models.FloatField(null=True, blank=True)
+    created_at = models.DateTimeField(auto_now_add=True)
+    
+    class Meta:
+        indexes = [
+            models.Index(fields=['-created_at']),
+        ]
+    
+    def __str__(self):
+        return f"{self.product.sku} - Score: {self.score}/{self.max_score}"
+
+class CategoryAttributeRule(models.Model):
+    """Define mandatory attributes per category"""
+    category = models.CharField(max_length=100)
+    attribute_name = models.CharField(max_length=100)
+    is_mandatory = models.BooleanField(default=False)
+    valid_values = models.JSONField(default=list, blank=True)
+    data_type = models.CharField(max_length=50, default='string')
+    validation_regex = models.CharField(max_length=500, blank=True)
+    min_length = models.IntegerField(null=True, blank=True)
+    max_length = models.IntegerField(null=True, blank=True)
+    description = models.TextField(blank=True)
+    
+    class Meta:
+        unique_together = ('category', 'attribute_name')
+        indexes = [
+            models.Index(fields=['category']),
+        ]
+    
+    def __str__(self):
+        return f"{self.category} - {self.attribute_name}"

BIN
core/services/__pycache__/attribute_scorer.cpython-313.pyc


BIN
core/services/__pycache__/gemini_service.cpython-313.pyc


BIN
core/services/__pycache__/seo_scorer.cpython-313.pyc


+ 828 - 0
core/services/attribute_scorer.py

@@ -0,0 +1,828 @@
+# attribute_scorer.py (Enhanced with SEO)
+import re
+import time
+from typing import Dict, List, Tuple
+from rapidfuzz import fuzz, process
+import spacy
+from collections import defaultdict
+import logging
+
+logger = logging.getLogger(__name__)
+
+class AttributeQualityScorer:
+    """
+    Enhanced scorer with AI-powered suggestions, robust error handling, and SEO scoring
+    """
+    
+    def __init__(self, use_ai: bool = True, use_seo: bool = True):
+        # Load spaCy model
+        try:
+            self.nlp = spacy.load("en_core_web_sm")
+        except Exception as e:
+            logger.warning(f"spaCy model not loaded: {e}")
+            self.nlp = None
+        
+        # Initialize Gemini service
+        self.use_ai = use_ai
+        if use_ai:
+            try:
+                from .gemini_service import GeminiAttributeService
+                self.ai_service = GeminiAttributeService()
+            except Exception as e:
+                logger.warning(f"Gemini service not available: {e}")
+                self.use_ai = False
+                self.ai_service = None
+        
+        # Initialize SEO scorer
+        self.use_seo = use_seo
+        if use_seo:
+            try:
+                from .seo_scorer import SEODiscoverabilityScorer
+                self.seo_scorer = SEODiscoverabilityScorer()
+            except Exception as e:
+                logger.warning(f"SEO scorer not available: {e}")
+                self.use_seo = False
+                self.seo_scorer = None
+        
+        # Updated weights to include SEO (total = 100%)
+        self.weights = {
+            'mandatory_fields': 0.34,    # Reduced from 40% -> 34%
+            'standardization': 0.26,     # Reduced from 30% -> 26%
+            'missing_values': 0.17,      # Reduced from 20% -> 17%
+            'consistency': 0.08,         # Reduced from 10% -> 8%
+            'seo_discoverability': 0.15  # NEW: 15%
+        }
+    
+
+    def score_product(self, product: Dict, category_rules: List[Dict], generate_ai_suggestions: bool = True) -> Dict:
+        """
+        Enhanced scoring with AI suggestions, SEO scoring, and guaranteed AI call
+        """
+        start_time = time.time()
+        attributes = product.get('attributes', {})
+        category = product.get('category', '')
+
+        # Initialize scores
+        scores = {
+            'mandatory_fields': 0,
+            'standardization': 0,
+            'missing_values': 0,
+            'consistency': 0,
+            'seo_discoverability': 0
+        }
+
+        issues = []
+        suggestions = []
+
+        # --- Mandatory Fields ---
+        try:
+            mandatory_score, mandatory_issues, mandatory_suggestions = self._check_mandatory_fields(attributes, category_rules)
+            scores['mandatory_fields'] = mandatory_score
+            issues.extend(mandatory_issues)
+            suggestions.extend(mandatory_suggestions)
+        except Exception as e:
+            logger.error(f"[Mandatory Fields] Error: {e}")
+            scores['mandatory_fields'] = 0
+
+        # --- Standardization ---
+        try:
+            std_score, std_issues, std_suggestions = self._check_standardization(attributes, category_rules)
+            scores['standardization'] = std_score
+            issues.extend(std_issues)
+            suggestions.extend(std_suggestions)
+        except Exception as e:
+            logger.error(f"[Standardization] Error: {e}")
+            scores['standardization'] = 0
+
+        # --- Missing Values ---
+        try:
+            missing_score, missing_issues, missing_suggestions = self._check_missing_values(attributes, category_rules)
+            scores['missing_values'] = missing_score
+            issues.extend(missing_issues)
+            suggestions.extend(missing_suggestions)
+        except Exception as e:
+            logger.error(f"[Missing Values] Error: {e}")
+            scores['missing_values'] = 0
+
+        # --- Consistency ---
+        try:
+            consistency_score, consistency_issues = self._check_consistency(attributes, product.get('title', ''), product.get('description', ''))
+            scores['consistency'] = consistency_score
+            issues.extend(consistency_issues)
+        except Exception as e:
+            logger.error(f"[Consistency] Error: {e}")
+            scores['consistency'] = 0
+
+        # --- SEO & Discoverability (NEW) ---
+        seo_result = None
+        if self.use_seo and self.seo_scorer:
+            try:
+                seo_result = self.seo_scorer.score_seo(product, category_rules)
+                scores['seo_discoverability'] = seo_result['seo_score']
+                issues.extend(seo_result['issues'])
+                suggestions.extend(seo_result['suggestions'])
+            except Exception as e:
+                logger.error(f"[SEO Scoring] Error: {e}")
+                scores['seo_discoverability'] = 0
+
+        # --- Final Score ---
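+        # Each sub-score is 0-100 and the weights sum to 1.0,
+        # so the weighted blend stays on a 0-100 scale.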
+        final_score = sum(scores[key] * self.weights[key] for key in scores)
+
+        result = {
+            'final_score': round(final_score, 2),
+            'max_score': 100.0,
+            'breakdown': scores,
+            'issues': issues,
+            'suggestions': suggestions,
+            'weights': self.weights,
+            'processing_time': round(time.time() - start_time, 3)
+        }
+
+        # Add SEO-specific details
+        if seo_result:
+            result['seo_details'] = {
+                'breakdown': seo_result['breakdown'],
+                'extracted_keywords': seo_result.get('extracted_keywords', []),
+                'missing_high_value_terms': seo_result.get('missing_high_value_terms', [])
+            }
+
+        # --- AI Suggestions (Guaranteed Attempt) ---
+        if generate_ai_suggestions and self.use_ai:
+            try:
+                logger.info(f"Generating AI suggestions for SKU: {product.get('sku')}, issues count: {len(issues)}")
+                ai_suggestions = self.ai_service.generate_attribute_suggestions(
+                    product,
+                    issues,
+                    category_rules
+                )
+                if not ai_suggestions:
+                    logger.warning(f"AI service returned empty suggestions for SKU: {product.get('sku')}")
+                    ai_suggestions = {"note": "No AI suggestions generated"}
+                result['ai_suggestions'] = ai_suggestions
+            except Exception as e:
+                logger.error(f"[AI Suggestions] Error: {e}")
+                result['ai_suggestions'] = {'error': str(e)}
+
+        return result
+
+    
+    def _check_mandatory_fields(self, attributes: Dict, rules: List[Dict]) -> Tuple[float, List, List]:
+        """Enhanced mandatory field validation with detailed feedback"""
+        mandatory_rules = [r for r in rules if r.get('is_mandatory', False)]
+        
+        if not mandatory_rules:
+            return 100.0, [], []
+        
+        present_count = 0
+        issues = []
+        suggestions = []
+        
+        for rule in mandatory_rules:
+            attr_name = rule['attribute_name']
+            
+            # Check presence and validity
+            if attr_name in attributes and attributes[attr_name]:
+                value = str(attributes[attr_name]).strip()
+                
+                if not value:
+                    issues.append(f"Mandatory field '{attr_name}' is empty")
+                    suggestions.append(f"Provide a non-empty value for {attr_name}")
+                    continue
+                
+                # Check length constraints
+                min_len = rule.get('min_length')
+                max_len = rule.get('max_length')
+                
+                if min_len and len(value) < min_len:
+                    issues.append(f"'{attr_name}' too short (min: {min_len} chars)")
+                    suggestions.append(f"Expand {attr_name} to at least {min_len} characters")
+                    continue
+                
+                if max_len and len(value) > max_len:
+                    issues.append(f"'{attr_name}' too long (max: {max_len} chars)")
+                    suggestions.append(f"Shorten {attr_name} to {max_len} characters or less")
+                    continue
+                
+                # Check regex pattern
+                regex = rule.get('validation_regex')
+                if regex and not re.match(regex, value):
+                    issues.append(f"'{attr_name}' format invalid")
+                    suggestions.append(f"Ensure {attr_name} matches required format")
+                    continue
+                
+                present_count += 1
+            else:
+                issues.append(f"Missing mandatory field: {attr_name}")
+                desc = rule.get('description', '')
+                if desc:
+                    suggestions.append(f"Add {attr_name}: {desc}")
+                else:
+                    suggestions.append(f"Add required attribute: {attr_name}")
+        
+        score = (present_count / len(mandatory_rules)) * 100 if mandatory_rules else 100.0
+        return score, issues, suggestions
+    
+    def _check_standardization(self, attributes: Dict, rules: List[Dict]) -> Tuple[float, List, List]:
+        """Enhanced standardization with better fuzzy matching"""
+        standardized_rules = [r for r in rules if r.get('valid_values')]
+        
+        if not standardized_rules:
+            return 100.0, [], []
+        
+        correct_count = 0
+        issues = []
+        suggestions = []
+        
+        for rule in standardized_rules:
+            attr_name = rule['attribute_name']
+            valid_values = rule['valid_values']
+            
+            if attr_name not in attributes or not attributes[attr_name]:
+                continue
+            
+            actual_value = str(attributes[attr_name]).strip()
+            
+            if not actual_value:
+                continue
+            
+            # Exact match
+            if actual_value in valid_values:
+                correct_count += 1
+                continue
+            
+            # Case-insensitive match
+            lower_valid = {v.lower(): v for v in valid_values}
+            if actual_value.lower() in lower_valid:
+                correct_count += 1
+                if actual_value != lower_valid[actual_value.lower()]:
+                    issues.append(f"{attr_name}: Case mismatch - '{actual_value}' should be '{lower_valid[actual_value.lower()]}'")
+                    suggestions.append(f"Correct capitalization of {attr_name} to: {lower_valid[actual_value.lower()]}")
+                continue
+            
+            # Fuzzy matching with multiple scorers
+            best_match = None
+            best_score = 0
+            
+            for scorer in [fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio]:
+                match = process.extractOne(actual_value, valid_values, scorer=scorer)
+                if match and match[1] > best_score:
+                    best_match = match
+                    best_score = match[1]
+            
+            if best_match and best_score >= 80:
+                correct_count += 1
+                if best_score < 100:
+                    issues.append(f"{attr_name}: '{actual_value}' likely means '{best_match[0]}' (confidence: {best_score}%)")
+                    suggestions.append(f"Standardize {attr_name} to: {best_match[0]}")
+            else:
+                issues.append(f"{attr_name}: '{actual_value}' not recognized. Valid: {', '.join(valid_values[:5])}")
+                suggestions.append(f"Change {attr_name} to one of: {', '.join(valid_values[:3])}")
+        
+        score = (correct_count / len(standardized_rules)) * 100 if standardized_rules else 100.0
+        return score, issues, suggestions
+    
+    def _check_missing_values(self, attributes: Dict, rules: List[Dict]) -> Tuple[float, List, List]:
+        """Enhanced placeholder detection"""
+        placeholder_patterns = [
+            r'^n/?a$', r'^none$', r'^null$', r'^-+$', r'^\.+$',
+            r'^tbd$', r'^to be determined$', r'^unknown$', r'^na$',
+            r'^todo$', r'^pending$', r'^\?+$', r'^xxx+$', r'^placeholder$'
+        ]
+        
+        total_attrs = len(rules)
+        valid_count = 0
+        issues = []
+        suggestions = []
+        
+        for rule in rules:
+            attr_name = rule['attribute_name']
+            
+            if attr_name not in attributes:
+                continue
+            
+            value = str(attributes[attr_name]).strip()
+            
+            # Check if empty
+            if not value:
+                issues.append(f"'{attr_name}' is empty")
+                suggestions.append(f"Provide a valid value for {attr_name}")
+                continue
+            
+            # Check if placeholder
+            value_lower = value.lower()
+            is_placeholder = any(re.match(pattern, value_lower, re.IGNORECASE) for pattern in placeholder_patterns)
+            
+            if is_placeholder:
+                issues.append(f"'{attr_name}' contains placeholder: '{value}'")
+                suggestions.append(f"Replace placeholder in {attr_name} with actual data")
+                continue
+            
+            # Check for suspicious patterns
+            if len(value) < 2 and rule.get('is_mandatory'):
+                issues.append(f"'{attr_name}' suspiciously short: '{value}'")
+                suggestions.append(f"Provide more detailed {attr_name}")
+                continue
+            
+            valid_count += 1
+        
+        score = (valid_count / total_attrs) * 100 if total_attrs > 0 else 100.0
+        return score, issues, suggestions
+    
+    def _check_consistency(self, attributes: Dict, title: str, description: str) -> Tuple[float, List]:
+        """Enhanced consistency checking with context awareness"""
+        issues = []
+        consistency_count = 0
+        total_checks = 0
+        
+        check_attrs = ['brand', 'color', 'size', 'material', 'model', 'weight', 'dimensions']
+        combined_text = f"{title} {description}".lower()
+        
+        for attr in check_attrs:
+            if attr not in attributes or not attributes[attr]:
+                continue
+            
+            total_checks += 1
+            attr_value = str(attributes[attr]).lower().strip()
+            
+            # Skip very short values
+            if len(attr_value) < 2:
+                consistency_count += 1
+                continue
+            
+            # Direct substring match
+            if attr_value in combined_text:
+                consistency_count += 1
+                continue
+            
+            # Word boundary match
+            words_in_text = set(combined_text.split())
+            words_in_attr = set(attr_value.split())
+            
+            if words_in_attr.issubset(words_in_text):
+                consistency_count += 1
+                continue
+            
+            # Fuzzy word matching
+            text_words = combined_text.split()
+            matches = 0
+            for attr_word in words_in_attr:
+                match = process.extractOne(attr_word, text_words, scorer=fuzz.ratio)
+                if match and match[1] >= 80:
+                    matches += 1
+            
+            if matches / len(words_in_attr) >= 0.7:
+                consistency_count += 1
+                continue
+            
+            issues.append(f"'{attr.title()}': '{attributes[attr]}' not clearly mentioned in title/description")
+        
+        score = (consistency_count / total_checks) * 100 if total_checks > 0 else 100.0
+        return score, issues
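For instance, the word-subset tier of the consistency check accepts an attribute whose tokens all appear in the text, even out of order (illustrative values):

    combined_text = "nike air runner shoes in navy blue"
    attr_value = "blue navy"

    # Tier 2: every attribute token appears somewhere in the text.
    print(set(attr_value.split()).issubset(set(combined_text.split())))  # True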
+    
+    def extract_attributes_from_text(self, text: str, category: str = '') -> Dict:
+        """Enhanced attribute extraction with AI fallback"""
+        extracted = {}
+        
+        # Try spaCy first
+        if self.nlp:
+            extracted = self._extract_with_spacy(text)
+        
+        # Use AI if available and spaCy found little
+        if self.use_ai and len(extracted) < 3:
+            try:
+                ai_extracted = self.ai_service.extract_attributes_with_ai(text, '', category)
+                extracted.update({k: v for k, v in ai_extracted.items() if v})
+            except Exception as e:
+                logger.error(f"AI extraction failed: {e}")
+        
+        return extracted
+    
+    def _extract_with_spacy(self, text: str) -> Dict:
+        """Extract using spaCy NER"""
+        doc = self.nlp(text)
+        extracted = defaultdict(list)
+        
+        for ent in doc.ents:
+            if ent.label_ == 'ORG':
+                extracted['brand'].append(ent.text)
+            elif ent.label_ == 'QUANTITY':
+                extracted['size'].append(ent.text)
+            elif ent.label_ == 'PRODUCT':
+                extracted['product_type'].append(ent.text)
+        
+        # Color detection
+        colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'orange', 
+                 'purple', 'pink', 'brown', 'gray', 'grey', 'silver', 'gold']
+        text_lower = text.lower()
+        for color in colors:
+            if color in text_lower:
+                extracted['color'].append(color.title())
+        
+        # Return most common value
+        result = {}
+        for key, values in extracted.items():
+            if values:
+                result[key] = max(set(values), key=values.count)
+        
+        return result
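A hedged usage sketch of the spaCy path above, assuming en_core_web_sm is installed (python -m spacy download en_core_web_sm); the exact entities emitted depend on the model and input text:

    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Nike Air Zoom running shoes, black, size 10")
    for ent in doc.ents:
        print(ent.text, ent.label_)  # e.g. Nike ORG (model-dependent)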

+ 1084 - 0
core/services/gemini_service.py

@@ -0,0 +1,1084 @@
+
+
+# import google.generativeai as genai
+# import json
+# import logging
+# from typing import Dict, List
+# from django.conf import settings
+# from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
+
+# logger = logging.getLogger(__name__)
+
+# class GeminiAttributeService:
+#     """Service to interact with Google Gemini API for attribute suggestions"""
+    
+#     def __init__(self):
+#         # Configure Gemini API
+#         api_key = getattr(settings, 'GEMINI_API_KEY', None)
+#         if not api_key:
+#             raise ValueError("GEMINI_API_KEY not found in settings")
+#         genai.configure(api_key=api_key)
+#         self.model = genai.GenerativeModel('gemini-2.5-flash') 
+        
+#     @retry(
+#         stop=stop_after_attempt(3),
+#         wait=wait_exponential(multiplier=1, min=2, max=10),
+#         retry=retry_if_exception_type(Exception),
+#         before_sleep=lambda retry_state: logger.info(f"Retrying Gemini API call, attempt {retry_state.attempt_number}")
+#     )
+#     def _call_gemini_api(self, prompt):
+#         """Helper method to call Gemini API with retry logic"""
+#         return self.model.generate_content(
+#             prompt,
+#             generation_config=genai.types.GenerationConfig(
+#                 temperature=0.3,
+#                 top_p=0.95,
+#                 top_k=40,
+#                 max_output_tokens=4096  # Increased to handle complex responses
+#             ),
+#             safety_settings={
+#                 genai.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
+#                 genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
+#                 genai.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH: genai.types.HarmBlockThreshold.BLOCK_NONE,
+#                 genai.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: genai.types.HarmBlockThreshold.BLOCK_NONE
+#             }
+#         )
+
+
+#     def generate_attribute_suggestions(
+#         self, 
+#         product: Dict, 
+#         issues: List[str], 
+#         category_rules: List[Dict]
+#     ) -> Dict:
+#         """
+#         Use Gemini to generate intelligent suggestions for fixing attribute issues
+#         ...
+#         """
+#         try:
+#             prompt = self._build_prompt(product, issues, category_rules)
+#             response = self._call_gemini_api(prompt)
+#             print(f"response is {response}")
+            
+#             if not response.candidates:
+#                 logger.error(f"No candidates returned. Response: {response}")
+#                 return {
+#                     'error': 'No candidates returned by Gemini API',
+#                     'fallback_suggestions': self._generate_fallback_suggestions(issues)
+#                 }
+            
+#             candidate = response.candidates[0]
+            
+#             # --- START FIX: Use candidate.finish_reason.name for robust check ---
+#             # Check if the finish reason indicates a block (e.g., SAFETY, OTHER, RECITATION)
+#             finish_reason_name = candidate.finish_reason.name
+            
+#             if finish_reason_name in ("SAFETY", "RECITATION", "OTHER"): # Add other block reasons as needed
+#                 logger.error(f"Response blocked by {finish_reason_name}. Safety ratings: {candidate.safety_ratings}")
+#                 return {
+#                     'error': f'Response blocked by {finish_reason_name} filters',
+#                     'safety_ratings': [
+#                         {'category': str(r.category), 'probability': str(r.probability)}
+#                         for r in candidate.safety_ratings
+#                     ],
+#                     'fallback_suggestions': self._generate_fallback_suggestions(issues)
+#                 }
+#             # --- END FIX ---
+            
+#             logger.info(f"Raw response: {response.text[:500]}...")
+#             suggestions = self._parse_response(response.text)
+#             logger.info(f"Parsed suggestions: {suggestions}")
+
+#             return suggestions
+                
+#         except Exception as e:
+#             logger.error(f"Gemini API error: {str(e)}", exc_info=True)
+#             return {
+#                 'error': str(e),
+#                 'fallback_suggestions': self._generate_fallback_suggestions(issues)
+#             }
+
+
+
+#     def _build_prompt(self, product: Dict, issues: List[str], rules: List[Dict]) -> str:
+#         """Build a structured prompt for Gemini"""
+#         mandatory_attrs = [r['attribute_name'] for r in rules if r.get('is_mandatory')]
+#         valid_values_map = {
+#             r['attribute_name']: r.get('valid_values', []) 
+#             for r in rules if r.get('valid_values')
+#         }
+        
+#         # Sanitize issues to avoid ambiguous phrasing
+#         cleaned_issues = [
+#             issue.replace("suspiciously short", "may need more detail")
+#                  .replace("not recognized", "not in valid values")
+#                  .replace("likely means", "recommended correction")
+#                  .replace("not clearly mentioned", "missing from title/description")
+#             for issue in issues
+#         ]
+        
+#         prompt = f"""You are an expert e-commerce product data analyst specializing in clothing products. All input data is safe, non-sensitive, and related to clothing product attributes. Your task is to analyze product attributes and provide specific, actionable suggestions to fix identified issues, ensuring compliance with category rules.
+
+# PRODUCT INFORMATION:
+# - SKU: {product.get('sku', 'N/A')}
+# - Category: {product.get('category', 'N/A')}
+# - Title: {product.get('title', 'N/A')}
+# - Description: {product.get('description', 'N/A')}
+# - Current Attributes: {json.dumps(product.get('attributes', {}), indent=2)}
+
+# CATEGORY RULES:
+# - Mandatory Attributes: {', '.join(mandatory_attrs) or 'None'}
+# - Valid Values: {json.dumps(valid_values_map, indent=2) or '{}'}
+
+# DETECTED ISSUES:
+# {chr(10).join(f"- {issue}" for issue in cleaned_issues) or '- None'}
+
+# TASK:
+# Analyze the product data and issues. Provide specific suggestions to fix each issue and extract missing attributes from the title or description. Ensure all suggestions are relevant to clothing products and formatted as valid JSON.
+
+# OUTPUT FORMAT (return valid JSON only):
+# {{
+#   "corrected_attributes": {{
+#     "attribute_name": "suggested_value"
+#   }},
+#   "missing_attributes": {{
+#     "attribute_name": "extracted_value"
+#   }},
+#   "improvements": [
+#     {{
+#       "issue": "description of the issue",
+#       "suggestion": "specific action to take",
+#       "confidence": "high/medium/low"
+#     }}
+#   ],
+#   "quality_score_prediction": integer,
+#   "reasoning": "Brief explanation of suggested changes"
+# }}"""
+
+#         logger.info(f"Generated prompt (length: {len(prompt)} chars, ~{len(prompt)//4} tokens): {prompt}")
+#         return prompt
+
+#     def _parse_response(self, response_text: str) -> Dict:
+#         """Parse Gemini's response and extract JSON"""
+#         try:
+#             # Remove markdown code blocks and language identifier
+#             cleaned = response_text.strip()
+#             if cleaned.startswith('```'):
+#                 cleaned = cleaned.split('```')[1].strip()
+#                 if cleaned.startswith('json'):
+#                     cleaned = cleaned[4:].strip()
+            
+#             # Attempt to parse JSON
+#             parsed = json.loads(cleaned)
+#             return parsed
+#         except json.JSONDecodeError as e:
+#             logger.error(f"Failed to parse Gemini response: {e}")
+#             logger.error(f"Response was: {response_text[:1000]}...")
+#             # Attempt to fix partial JSON
+#             try:
+#                 # Truncate at last valid closing brace
+#                 last_valid = cleaned.rfind('}')
+#                 if last_valid != -1:
+#                     partial_json = cleaned[:last_valid + 1]
+#                     parsed = json.loads(partial_json)
+#                     logger.warning("Parsed partial JSON response")
+#                     return parsed
+#             except json.JSONDecodeError:
+#                 logger.error("Could not parse partial JSON")
+#             return {
+#                 'error': 'Failed to parse AI response',
+#                 'raw_response': response_text,
+#                 'fallback_suggestions': []
+#             }
+
+#     def _generate_fallback_suggestions(self, issues: List[str]) -> List[Dict]:
+#         """Generate enhanced fallback suggestions based on issues"""
+#         suggestions = []
+#         for issue in issues:
+#             suggestion = "Please review and correct this issue manually"
+#             confidence = "low"
+            
+#             # Specific suggestions for common issues
+#             if "Missing mandatory field" in issue:
+#                 attr = issue.split("Missing mandatory field: ")[-1]
+#                 suggestion = f"Provide a valid value for {attr} (e.g., extract from title/description or use a common value like 'Black' for color)"
+#                 confidence = "medium"
+#             elif "not in valid values" in issue:
+#                 attr = issue.split(":")[0].strip()
+#                 suggestion = f"Choose a valid value for {attr} (e.g., XS, S, M, L, XL for size)"
+#                 confidence = "medium"
+#             elif "contains placeholder" in issue:
+#                 attr = issue.split("'")[1]
+#                 suggestion = f"Replace the placeholder in {attr} with a specific value (e.g., M, L, XL for size)"
+#                 confidence = "high"
+#             elif "recommended correction" in issue:
+#                 correction = issue.split("recommended correction ")[-1].split(" ")[0].strip(")'")
+#                 attr = issue.split(":")[0].strip()
+#                 suggestion = f"Correct {attr} to '{correction}'"
+#                 confidence = "high"
+#             elif "may need more detail" in issue:
+#                 attr = issue.split("'")[1]
+#                 suggestion = f"Provide a more detailed value for {attr} (e.g., 'Medium' instead of 'M')"
+#                 confidence = "medium"
+#             elif "missing from title/description" in issue:
+#                 attr = issue.split("'")[1]
+#                 value = issue.split("'")[3] if len(issue.split("'")) > 3 else "unknown"
+#                 suggestion = f"Add '{value}' to the title or description for {attr} (e.g., update title to include '{value}')"
+#                 confidence = "high"
+            
+#             suggestions.append({
+#                 'issue': issue,
+#                 'suggestion': suggestion,
+#                 'confidence': confidence
+#             })
+#         return suggestions
+    
+#     def extract_attributes_with_ai(self, title: str, description: str, category: str) -> Dict:
+#         """
+#         Use Gemini to extract attributes from unstructured text
+#         """
+#         try:
+#             prompt = f"""You are an expert e-commerce product data analyst specializing in clothing products. All input data is safe, non-sensitive, and related to clothing product attributes. Extract product attributes from the following text.
+
+# Category: {category}
+# Title: {title}
+# Description: {description}
+
+# Extract these attributes if present:
+# - brand
+# - color
+# - size
+# - material
+# - model
+# - weight
+# - dimensions
+# - warranty
+
+# Return ONLY valid JSON in this format:
+# {{
+#   "brand": "extracted brand or null",
+#   "color": "extracted color or null",
+#   "size": "extracted size or null",
+#   "material": "extracted material or null",
+#   "model": "extracted model or null",
+#   "weight": "extracted weight or null",
+#   "dimensions": "extracted dimensions or null",
+#   "warranty": "extracted warranty or null"
+# }}"""
+
+#             response = self._call_gemini_api(prompt)
+#             logger.info(f"Raw extraction response: {response.text[:500]}...")
+#             return self._parse_response(response.text)
+            
+#         except Exception as e:
+#             logger.error(f"AI extraction error: {str(e)}")
+#             return {
+#                 'error': str(e),
+#                 'fallback': {}
+#             }
+
+
+
+# import google.generativeai as genai
+# import json
+# import logging
+# from typing import Dict, List
+# from django.conf import settings
+# from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
+
+# logger = logging.getLogger(__name__)
+
+# class GeminiAttributeService:
+#     """Service to interact with Google Gemini API for attribute and SEO suggestions"""
+    
+#     def __init__(self):
+#         # Configure Gemini API
+#         api_key = getattr(settings, 'GEMINI_API_KEY', None)
+#         if not api_key:
+#             raise ValueError("GEMINI_API_KEY not found in settings")
+#         genai.configure(api_key=api_key)
+#         self.model = genai.GenerativeModel('gemini-2.5-flash') 
+        
+#     @retry(
+#         stop=stop_after_attempt(3),
+#         wait=wait_exponential(multiplier=1, min=2, max=10),
+#         retry=retry_if_exception_type(Exception),
+#         before_sleep=lambda retry_state: logger.info(f"Retrying Gemini API call, attempt {retry_state.attempt_number}")
+#     )
+#     def _call_gemini_api(self, prompt):
+#         """Helper method to call Gemini API with retry logic"""
+#         return self.model.generate_content(
+#             prompt,
+#             generation_config=genai.types.GenerationConfig(
+#                 temperature=0.3,
+#                 top_p=0.95,
+#                 top_k=40,
+#                 max_output_tokens=4096
+#             ),
+#             safety_settings={
+#                 genai.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
+#                 genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
+#                 genai.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH: genai.types.HarmBlockThreshold.BLOCK_NONE,
+#                 genai.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: genai.types.HarmBlockThreshold.BLOCK_NONE
+#             }
+#         )
+
+
+#     def generate_attribute_suggestions(
+#         self, 
+#         product: Dict, 
+#         issues: List[str], 
+#         category_rules: List[Dict]
+#     ) -> Dict:
+#         """
+#         Use Gemini to generate intelligent suggestions for fixing attribute issues
+#         Includes SEO-aware recommendations
+#         """
+#         try:
+#             prompt = self._build_prompt(product, issues, category_rules)
+#             response = self._call_gemini_api(prompt)
+            
+#             if not response.candidates:
+#                 logger.error(f"No candidates returned. Response: {response}")
+#                 return {
+#                     'error': 'No candidates returned by Gemini API',
+#                     'fallback_suggestions': self._generate_fallback_suggestions(issues)
+#                 }
+            
+#             candidate = response.candidates[0]
+#             finish_reason_name = candidate.finish_reason.name
+            
+#             if finish_reason_name in ("SAFETY", "RECITATION", "OTHER"):
+#                 logger.error(f"Response blocked by {finish_reason_name}. Safety ratings: {candidate.safety_ratings}")
+#                 return {
+#                     'error': f'Response blocked by {finish_reason_name} filters',
+#                     'safety_ratings': [
+#                         {'category': str(r.category), 'probability': str(r.probability)}
+#                         for r in candidate.safety_ratings
+#                     ],
+#                     'fallback_suggestions': self._generate_fallback_suggestions(issues)
+#                 }
+            
+#             logger.info(f"Raw response: {response.text[:500]}...")
+#             suggestions = self._parse_response(response.text)
+#             logger.info(f"Parsed suggestions: {suggestions}")
+
+#             return suggestions
+                
+#         except Exception as e:
+#             logger.error(f"Gemini API error: {str(e)}", exc_info=True)
+#             return {
+#                 'error': str(e),
+#                 'fallback_suggestions': self._generate_fallback_suggestions(issues)
+#             }
+
+
+
+#     def _build_prompt(self, product: Dict, issues: List[str], rules: List[Dict]) -> str:
+#         """Build a structured prompt for Gemini with SEO awareness"""
+#         mandatory_attrs = [r['attribute_name'] for r in rules if r.get('is_mandatory')]
+#         valid_values_map = {
+#             r['attribute_name']: r.get('valid_values', []) 
+#             for r in rules if r.get('valid_values')
+#         }
+        
+#         # Sanitize issues
+#         cleaned_issues = [
+#             issue.replace("suspiciously short", "may need more detail")
+#                  .replace("not recognized", "not in valid values")
+#                  .replace("likely means", "recommended correction")
+#                  .replace("not clearly mentioned", "missing from title/description")
+#             for issue in issues
+#         ]
+        
+#         # Separate SEO issues
+#         seo_issues = [i for i in cleaned_issues if i.startswith("SEO:")]
+#         attribute_issues = [i for i in cleaned_issues if not i.startswith("SEO:")]
+        
+#         prompt = f"""You are an expert e-commerce product data analyst specializing in clothing products and SEO optimization. All input data is safe, non-sensitive, and related to clothing product attributes. Your task is to analyze product attributes and provide specific, actionable suggestions to fix identified issues, ensuring compliance with category rules and SEO best practices.
+
+# PRODUCT INFORMATION:
+# - SKU: {product.get('sku', 'N/A')}
+# - Category: {product.get('category', 'N/A')}
+# - Title: {product.get('title', 'N/A')}
+# - Description: {product.get('description', 'N/A')}
+# - Current Attributes: {json.dumps(product.get('attributes', {}), indent=2)}
+
+# CATEGORY RULES:
+# - Mandatory Attributes: {', '.join(mandatory_attrs) or 'None'}
+# - Valid Values: {json.dumps(valid_values_map, indent=2) or '{}'}
+
+# DETECTED ATTRIBUTE ISSUES:
+# {chr(10).join(f"- {issue}" for issue in attribute_issues) or '- None'}
+
+# DETECTED SEO ISSUES:
+# {chr(10).join(f"- {issue}" for issue in seo_issues) or '- None'}
+
+# TASK:
+# 1. Analyze the product data and fix all attribute issues
+# 2. Provide SEO-optimized recommendations for title and description
+# 3. Suggest missing attributes that can be extracted from title/description
+# 4. Ensure all suggestions improve both data quality AND discoverability
+
+# SEO GUIDELINES:
+# - Title should be 50-100 characters with key attributes (brand, model, color, size)
+# - Description should be 50-150 words, descriptive, and include relevant keywords
+# - Include high-value search terms where appropriate (e.g., "premium", "durable", "authentic")
+# - Avoid keyword stuffing - keep text natural and readable
+
+# OUTPUT FORMAT (return valid JSON only):
+# {{
+#   "corrected_attributes": {{
+#     "attribute_name": "suggested_value"
+#   }},
+#   "missing_attributes": {{
+#     "attribute_name": "extracted_value"
+#   }},
+#   "seo_optimizations": {{
+#     "optimized_title": "SEO-friendly title suggestion (if title needs improvement)",
+#     "optimized_description": "SEO-friendly description suggestion (if description needs improvement)",
+#     "recommended_keywords": ["keyword1", "keyword2"],
+#     "title_improvements": "Specific changes for title",
+#     "description_improvements": "Specific changes for description"
+#   }},
+#   "improvements": [
+#     {{
+#       "issue": "description of the issue",
+#       "suggestion": "specific action to take",
+#       "confidence": "high/medium/low",
+#       "type": "attribute/seo"
+#     }}
+#   ],
+#   "quality_score_prediction": integer (0-100),
+#   "reasoning": "Brief explanation of suggested changes and expected impact on discoverability"
+# }}"""
+
+#         logger.info(f"Generated prompt (length: {len(prompt)} chars)")
+#         return prompt
+
+#     def generate_seo_suggestions(self, product: Dict, seo_issues: List[str]) -> Dict:
+#         """
+#         Generate SEO-specific suggestions using Gemini
+#         Focused prompt for SEO optimization only
+#         """
+#         try:
+#             prompt = f"""You are an SEO expert for e-commerce products. Analyze this product and provide SEO optimization suggestions.
+
+# PRODUCT:
+# - Title: {product.get('title', '')}
+# - Description: {product.get('description', '')}
+# - Category: {product.get('category', '')}
+# - Attributes: {json.dumps(product.get('attributes', {}), indent=2)}
+
+# SEO ISSUES DETECTED:
+# {chr(10).join(f"- {issue}" for issue in seo_issues)}
+
+# TASK:
+# Provide specific, actionable SEO improvements focusing on:
+# 1. Keyword optimization (include relevant search terms)
+# 2. Title structure (50-100 chars, include key attributes)
+# 3. Description quality (50-150 words, descriptive, engaging)
+# 4. Searchability (ensure users can find this product)
+
+# Return ONLY valid JSON:
+# {{
+#   "optimized_title": "Improved title with better SEO",
+#   "optimized_description": "Improved description with better SEO",
+#   "recommended_keywords": ["keyword1", "keyword2", "keyword3"],
+#   "changes_made": [
+#     "Specific change 1",
+#     "Specific change 2"
+#   ],
+#   "expected_improvement": "Brief explanation of SEO impact"
+# }}"""
+
+#             response = self._call_gemini_api(prompt)
+            
+#             if not response.candidates:
+#                 return {'error': 'No SEO suggestions generated'}
+            
+#             candidate = response.candidates[0]
+#             if candidate.finish_reason.name in ("SAFETY", "RECITATION", "OTHER"):
+#                 return {'error': f'Response blocked: {candidate.finish_reason.name}'}
+            
+#             return self._parse_response(response.text)
+            
+#         except Exception as e:
+#             logger.error(f"SEO suggestion error: {e}")
+#             return {'error': str(e)}
+
+#     def _parse_response(self, response_text: str) -> Dict:
+#         """Parse Gemini's response and extract JSON"""
+#         try:
+#             # Remove markdown code blocks and language identifier
+#             cleaned = response_text.strip()
+#             if cleaned.startswith('```'):
+#                 cleaned = cleaned.split('```')[1].strip()
+#                 if cleaned.startswith('json'):
+#                     cleaned = cleaned[4:].strip()
+            
+#             # Attempt to parse JSON
+#             parsed = json.loads(cleaned)
+#             return parsed
+#         except json.JSONDecodeError as e:
+#             logger.error(f"Failed to parse Gemini response: {e}")
+#             logger.error(f"Response was: {response_text[:1000]}...")
+#             # Attempt to fix partial JSON
+#             try:
+#                 # Truncate at last valid closing brace
+#                 last_valid = cleaned.rfind('}')
+#                 if last_valid != -1:
+#                     partial_json = cleaned[:last_valid + 1]
+#                     parsed = json.loads(partial_json)
+#                     logger.warning("Parsed partial JSON response")
+#                     return parsed
+#             except json.JSONDecodeError:
+#                 logger.error("Could not parse partial JSON")
+#             return {
+#                 'error': 'Failed to parse AI response',
+#                 'raw_response': response_text,
+#                 'fallback_suggestions': []
+#             }
+
+#     def _generate_fallback_suggestions(self, issues: List[str]) -> List[Dict]:
+#         """Generate enhanced fallback suggestions based on issues"""
+#         suggestions = []
+#         for issue in issues:
+#             suggestion = "Please review and correct this issue manually"
+#             confidence = "low"
+#             issue_type = "attribute"
+            
+#             # Detect if it's an SEO issue
+#             if issue.startswith("SEO:"):
+#                 issue_type = "seo"
+            
+#             # Specific suggestions for common issues
+#             if "Missing mandatory field" in issue:
+#                 attr = issue.split("Missing mandatory field: ")[-1]
+#                 suggestion = f"Provide a valid value for {attr}"
+#                 confidence = "medium"
+#             elif "not in valid values" in issue:
+#                 attr = issue.split(":")[0].strip()
+#                 suggestion = f"Choose a valid value for {attr}"
+#                 confidence = "medium"
+#             elif "contains placeholder" in issue:
+#                 attr = issue.split("'")[1]
+#                 suggestion = f"Replace placeholder in {attr} with actual value"
+#                 confidence = "high"
+#             elif "recommended correction" in issue:
+#                 suggestion = "Apply the suggested correction"
+#                 confidence = "high"
+#             elif "may need more detail" in issue:
+#                 attr = issue.split("'")[1]
+#                 suggestion = f"Provide more detailed value for {attr}"
+#                 confidence = "medium"
+#             elif "Title too short" in issue:
+#                 suggestion = "Expand title to 50-100 characters, include key attributes"
+#                 confidence = "high"
+#                 issue_type = "seo"
+#             elif "Description too short" in issue:
+#                 suggestion = "Expand description to 50-150 words with more details"
+#                 confidence = "high"
+#                 issue_type = "seo"
+#             elif "not mentioned in title/description" in issue:
+#                 attr = issue.split("'")[1] if "'" in issue else "attribute"
+#                 suggestion = f"Add {attr} to title or description for better SEO"
+#                 confidence = "high"
+#                 issue_type = "seo"
+#             elif "keyword" in issue.lower():
+#                 suggestion = "Add relevant search keywords to improve discoverability"
+#                 confidence = "medium"
+#                 issue_type = "seo"
+            
+#             suggestions.append({
+#                 'issue': issue,
+#                 'suggestion': suggestion,
+#                 'confidence': confidence,
+#                 'type': issue_type
+#             })
+#         return suggestions
+    
+#     def extract_attributes_with_ai(self, title: str, description: str, category: str) -> Dict:
+#         """
+#         Use Gemini to extract attributes from unstructured text
+#         """
+#         try:
+#             prompt = f"""You are an expert e-commerce product data analyst specializing in clothing products. All input data is safe, non-sensitive, and related to clothing product attributes. Extract product attributes from the following text.
+
+# Category: {category}
+# Title: {title}
+# Description: {description}
+
+# Extract these attributes if present:
+# - brand
+# - color
+# - size
+# - material
+# - model
+# - weight
+# - dimensions
+# - warranty
+
+# Return ONLY valid JSON in this format:
+# {{
+#   "brand": "extracted brand or null",
+#   "color": "extracted color or null",
+#   "size": "extracted size or null",
+#   "material": "extracted material or null",
+#   "model": "extracted model or null",
+#   "weight": "extracted weight or null",
+#   "dimensions": "extracted dimensions or null",
+#   "warranty": "extracted warranty or null"
+# }}"""
+
+#             response = self._call_gemini_api(prompt)
+#             logger.info(f"Raw extraction response: {response.text[:500]}...")
+#             return self._parse_response(response.text)
+            
+#         except Exception as e:
+#             logger.error(f"AI extraction error: {str(e)}")
+#             return {
+#                 'error': str(e),
+#                 'fallback': {}
+#             }
+
+
+
+import google.generativeai as genai
+import json
+import logging
+import re
+from typing import Dict, List, Optional
+from django.conf import settings
+from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
+
+logger = logging.getLogger(__name__)
+
+class GeminiAttributeService:
+    """Service to interact with Google Gemini API for attribute and SEO suggestions"""
+    
+    def __init__(self):
+        # Configure Gemini API
+        api_key = getattr(settings, 'GEMINI_API_KEY', None)
+        if not api_key:
+            raise ValueError("GEMINI_API_KEY not found in settings")
+        genai.configure(api_key=api_key)
+        self.model = genai.GenerativeModel('gemini-2.0-flash-exp')  # Use latest model
+        
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=2, max=10),
+        retry=retry_if_exception_type(Exception),
+        before_sleep=lambda retry_state: logger.info(f"Retrying Gemini API call, attempt {retry_state.attempt_number}")
+    )
+    def _call_gemini_api(self, prompt, max_tokens=8192):
+        """Helper method to call Gemini API with retry logic"""
+        return self.model.generate_content(
+            prompt,
+            generation_config=genai.types.GenerationConfig(
+                temperature=0.2,  # Lower for more consistent JSON
+                top_p=0.9,
+                top_k=40,
+                max_output_tokens=max_tokens,  # Increased default
+                response_mime_type="application/json"  # Force JSON output
+            ),
+            safety_settings={
+                genai.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
+                genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
+                genai.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH: genai.types.HarmBlockThreshold.BLOCK_NONE,
+                genai.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: genai.types.HarmBlockThreshold.BLOCK_NONE
+            }
+        )
+
+
+    def generate_attribute_suggestions(
+        self, 
+        product: Dict, 
+        issues: List[str], 
+        category_rules: List[Dict]
+    ) -> Dict:
+        """
+        Use Gemini to generate intelligent suggestions for fixing attribute issues
+        Includes SEO-aware recommendations with robust error handling
+        """
+        try:
+            # Limit issues to prevent prompt overflow
+            limited_issues = issues[:15] if len(issues) > 15 else issues
+            
+            prompt = self._build_prompt(product, limited_issues, category_rules)
+            response = self._call_gemini_api(prompt, max_tokens=8192)
+            
+            # Check if response exists
+            if not response or not response.candidates:
+                logger.error(f"No candidates returned for SKU: {product.get('sku')}")
+                return {
+                    'error': 'No candidates returned by Gemini API',
+                    'fallback_suggestions': self._generate_fallback_suggestions(limited_issues)
+                }
+            
+            candidate = response.candidates[0]
+            finish_reason_name = candidate.finish_reason.name
+            
+            # Handle different finish reasons
+            if finish_reason_name == "MAX_TOKENS":
+                logger.warning(f"Max tokens reached for SKU: {product.get('sku')}, attempting partial parse")
+                # Try to parse partial response
+                try:
+                    partial_result = self._parse_response(response.text)
+                    if partial_result and 'error' not in partial_result:
+                        return partial_result
+                except Exception:
+                    pass
+                # Retry with fewer issues
+                if len(issues) > 5:
+                    logger.info("Retrying with fewer issues")
+                    return self.generate_attribute_suggestions(product, issues[:5], category_rules)
+                else:
+                    return {
+                        'error': 'Response too long, using fallback',
+                        'fallback_suggestions': self._generate_fallback_suggestions(limited_issues)
+                    }
+            
+            elif finish_reason_name in ("SAFETY", "RECITATION", "OTHER"):
+                logger.error(f"Response blocked by {finish_reason_name} for SKU: {product.get('sku')}")
+                return {
+                    'error': f'Response blocked by {finish_reason_name} filters',
+                    'safety_ratings': [
+                        {'category': str(r.category), 'probability': str(r.probability)}
+                        for r in candidate.safety_ratings
+                    ],
+                    'fallback_suggestions': self._generate_fallback_suggestions(limited_issues)
+                }
+            
+            elif finish_reason_name != "STOP":
+                logger.warning(f"Unexpected finish reason: {finish_reason_name}")
+                return {
+                    'error': f'Unexpected finish reason: {finish_reason_name}',
+                    'fallback_suggestions': self._generate_fallback_suggestions(limited_issues)
+                }
+            
+            # Parse successful response
+            logger.info(f"Successfully received response for SKU: {product.get('sku')}")
+            suggestions = self._parse_response(response.text)
+            
+            if 'error' in suggestions:
+                logger.warning(f"Parse error for SKU: {product.get('sku')}, using fallback")
+                suggestions['fallback_suggestions'] = self._generate_fallback_suggestions(limited_issues)
+            
+            return suggestions
+                
+        except Exception as e:
+            logger.error(f"Gemini API error for SKU {product.get('sku')}: {str(e)}", exc_info=True)
+            return {
+                'error': str(e),
+                'fallback_suggestions': self._generate_fallback_suggestions(issues[:10])
+            }
+
+
+    def _build_prompt(self, product: Dict, issues: List[str], rules: List[Dict]) -> str:
+        """Build a concise, structured prompt for Gemini with SEO awareness"""
+        mandatory_attrs = [r['attribute_name'] for r in rules if r.get('is_mandatory')]
+        valid_values_map = {
+            r['attribute_name']: r.get('valid_values', [])[:5]  # Limit to 5 values
+            for r in rules if r.get('valid_values')
+        }
+        
+        # Sanitize and categorize issues
+        cleaned_issues = [
+            issue.replace("suspiciously short", "short value")
+                 .replace("not recognized", "invalid")
+                 .replace("likely means", "should be")
+                 .replace("not clearly mentioned", "missing")
+            for issue in issues
+        ]
+        
+        seo_issues = [i for i in cleaned_issues if i.startswith("SEO:")][:5]
+        attribute_issues = [i for i in cleaned_issues if not i.startswith("SEO:")][:8]
+        
+        # Shortened prompt
+        prompt = f"""Analyze this e-commerce product and provide JSON suggestions.
+
+PRODUCT:
+SKU: {product.get('sku')}
+Category: {product.get('category')}
+Title: {product.get('title', '')[:200]}
+Description: {product.get('description', '')[:300]}
+Attributes: {json.dumps(product.get('attributes', {}), ensure_ascii=False)}
+
+RULES:
+Mandatory: {', '.join(mandatory_attrs)}
+Valid Values: {json.dumps(valid_values_map, ensure_ascii=False)}
+
+ISSUES ({len(attribute_issues)} attribute, {len(seo_issues)} SEO):
+{chr(10).join(f"• {i}" for i in attribute_issues[:8])}
+{chr(10).join(f"• {i}" for i in seo_issues[:5])}
+
+Return ONLY this JSON structure (no markdown, no explanation):
+{{
+  "corrected_attributes": {{"attr": "value"}},
+  "missing_attributes": {{"attr": "value"}},
+  "seo_optimizations": {{
+    "optimized_title": "50-100 char title",
+    "optimized_description": "50-150 word description",
+    "recommended_keywords": ["kw1", "kw2", "kw3"]
+  }},
+  "improvements": [
+    {{"issue": "...", "suggestion": "...", "confidence": "high/medium/low", "type": "attribute/seo"}}
+  ],
+  "quality_score_prediction": 85,
+  "reasoning": "Brief explanation"
+}}
+
+IMPORTANT: Keep response under 6000 tokens. Prioritize top 3 most critical improvements."""
+
+        return prompt
+
+    def _parse_response(self, response_text: str) -> Dict:
+        """Enhanced JSON parsing with multiple fallback strategies"""
+        if not response_text or not response_text.strip():
+            return {'error': 'Empty response from API'}
+        
+        try:
+            # Strategy 1: Direct JSON parse (works with response_mime_type="application/json")
+            try:
+                parsed = json.loads(response_text)
+                logger.info("Successfully parsed JSON directly")
+                return parsed
+            except json.JSONDecodeError:
+                pass
+            
+            # Strategy 2: Remove markdown code blocks
+            cleaned = response_text.strip()
+            if '```' in cleaned:
+                # Extract content between code blocks
+                match = re.search(r'```(?:json)?\s*(\{.*\})\s*```', cleaned, re.DOTALL)
+                if match:
+                    cleaned = match.group(1)
+                else:
+                    # Remove all code block markers
+                    cleaned = re.sub(r'```(?:json)?', '', cleaned).strip()
+            
+            # Strategy 3: Find first { and last }
+            first_brace = cleaned.find('{')
+            last_brace = cleaned.rfind('}')
+            
+            if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
+                cleaned = cleaned[first_brace:last_brace + 1]
+            
+            # Strategy 4: Try parsing cleaned JSON
+            try:
+                parsed = json.loads(cleaned)
+                logger.info("Successfully parsed JSON after cleaning")
+                return parsed
+            except json.JSONDecodeError as e:
+                logger.warning(f"JSON parse error at position {e.pos}: {e.msg}")
+            
+            # Strategy 5: Attempt to fix common JSON issues
+            cleaned = self._fix_json_syntax(cleaned)
+            try:
+                parsed = json.loads(cleaned)
+                logger.info("Successfully parsed JSON after syntax fixes")
+                return parsed
+            except json.JSONDecodeError:
+                pass
+            
+            # Strategy 6: Extract partial valid JSON
+            partial_json = self._extract_partial_json(cleaned)
+            if partial_json:
+                logger.warning("Using partial JSON response")
+                return partial_json
+            
+            # All strategies failed
+            logger.error(f"All JSON parsing strategies failed. Response length: {len(response_text)}")
+            logger.error(f"Response preview: {response_text[:500]}...")
+            
+            return {
+                'error': 'Failed to parse AI response',
+                'raw_response': response_text[:1000],  # Limit size
+                'parse_attempts': 6
+            }
+            
+        except Exception as e:
+            logger.error(f"Unexpected error in _parse_response: {e}", exc_info=True)
+            return {
+                'error': f'Parse exception: {str(e)}',
+                'raw_response': response_text[:500] if response_text else 'None'
+            }
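Strategy 2's fence-stripping regex behaves like this on a typical markdown-wrapped reply (illustrative string):

    import re

    reply = '```json\n{"ok": true}\n```'
    m = re.search(r'```(?:json)?\s*(\{.*\})\s*```', reply, re.DOTALL)
    print(m.group(1))  # {"ok": true}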
+    
+    def _fix_json_syntax(self, json_str: str) -> str:
+        """Attempt to fix common JSON syntax issues"""
+        try:
+            # Remove trailing commas before closing braces/brackets
+            json_str = re.sub(r',\s*([}\]])', r'\1', json_str)
+            
+            # Do not try to regex-escape quotes inside strings here: a
+            # lookahead like (?<!\\)"(?=[^,:}\]]*[,:}\]]) also matches the
+            # quotes that delimit keys and corrupts otherwise-valid JSON.
+            
+            # Remove any trailing content after final }
+            last_brace = json_str.rfind('}')
+            if last_brace != -1:
+                json_str = json_str[:last_brace + 1]
+            
+            return json_str
+        except Exception:
+            return json_str
+    
+    def _extract_partial_json(self, json_str: str) -> Optional[Dict]:
+        """Extract valid partial JSON by finding complete objects"""
+        try:
+            # Try to find complete nested structures
+            depth = 0
+            start_idx = json_str.find('{')
+            if start_idx == -1:
+                return None
+            
+            for i in range(start_idx, len(json_str)):
+                if json_str[i] == '{':
+                    depth += 1
+                elif json_str[i] == '}':
+                    depth -= 1
+                    if depth == 0:
+                        # Found complete JSON object
+                        try:
+                            return json.loads(json_str[start_idx:i+1])
+                        except json.JSONDecodeError:
+                            continue
+            
+            return None
+        except Exception:
+            return None
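The brace-depth scan above can be tried in isolation; a standalone sketch of the same idea (like the original, it ignores braces inside string literals):

    import json

    def first_balanced_object(s):
        # Depth tracking: parse the first balanced {...} and drop the rest.
        start = s.find('{')
        if start == -1:
            return None
        depth = 0
        for i in range(start, len(s)):
            if s[i] == '{':
                depth += 1
            elif s[i] == '}':
                depth -= 1
                if depth == 0:
                    return json.loads(s[start:i + 1])
        return None

    print(first_balanced_object('{"a": {"b": 1}} trailing junk'))  # {'a': {'b': 1}}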
+
+    def _generate_fallback_suggestions(self, issues: List[str]) -> List[Dict]:
+        """Generate enhanced fallback suggestions based on issues"""
+        suggestions = []
+        
+        # Group similar issues
+        issue_categories = {
+            'missing': [],
+            'invalid': [],
+            'seo': [],
+            'other': []
+        }
+        
+        for issue in issues:
+            if 'missing' in issue.lower() or 'mandatory' in issue.lower():
+                issue_categories['missing'].append(issue)
+            elif 'invalid' in issue.lower() or 'not in valid' in issue.lower():
+                issue_categories['invalid'].append(issue)
+            elif issue.startswith('SEO:'):
+                issue_categories['seo'].append(issue)
+            else:
+                issue_categories['other'].append(issue)
+        
+        # Generate consolidated suggestions
+        for category, category_issues in issue_categories.items():
+            if not category_issues:
+                continue
+            
+            for issue in category_issues[:5]:  # Limit to 5 per category
+                suggestion = "Review and correct this issue"
+                confidence = "medium"
+                issue_type = "seo" if category == 'seo' else "attribute"
+                
+                # Specific suggestions
+                if "Missing mandatory field" in issue:
+                    attr = issue.split(":")[-1].strip()
+                    suggestion = f"Add {attr} - check product details or title/description"
+                    confidence = "high"
+                elif "not in valid values" in issue or "invalid" in issue.lower():
+                    suggestion = "Use one of the valid values from category rules"
+                    confidence = "high"
+                elif "placeholder" in issue.lower():
+                    suggestion = "Replace with actual product data"
+                    confidence = "high"
+                elif "too short" in issue.lower():
+                    if "title" in issue.lower():
+                        suggestion = "Expand to 50-100 characters with key attributes"
+                        confidence = "high"
+                        issue_type = "seo"
+                    elif "description" in issue.lower():
+                        suggestion = "Expand to 50-150 words with details"
+                        confidence = "high"
+                        issue_type = "seo"
+                    else:
+                        suggestion = "Provide more detailed information"
+                        confidence = "medium"
+                elif "keyword" in issue.lower() or "search term" in issue.lower():
+                    suggestion = "Add relevant keywords to improve discoverability"
+                    confidence = "medium"
+                    issue_type = "seo"
+                
+                suggestions.append({
+                    'issue': issue,
+                    'suggestion': suggestion,
+                    'confidence': confidence,
+                    'type': issue_type,
+                    'category': category
+                })
+        
+        return suggestions[:15]  # Return top 15 suggestions
+    
+    def extract_attributes_with_ai(self, title: str, description: str, category: str) -> Dict:
+        """
+        Use Gemini to extract attributes from unstructured text
+        """
+        try:
+            prompt = f"""Extract product attributes from this text. Return ONLY valid JSON.
+
+Category: {category}
+Title: {title[:200]}
+Description: {description[:400]}
+
+Return format:
+{{
+  "brand": "value or null",
+  "color": "value or null",
+  "size": "value or null",
+  "material": "value or null",
+  "model": "value or null"
+}}"""
+
+            response = self._call_gemini_api(prompt, max_tokens=1024)
+            
+            if not response or not response.candidates:
+                return {'error': 'No response'}
+            
+            return self._parse_response(response.text)
+            
+        except Exception as e:
+            logger.error(f"AI extraction error: {str(e)}")
+            return {'error': str(e)}
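For reference, a hedged usage sketch of the service (assumes GEMINI_API_KEY is configured in Django settings; the product, rules, and issues below are illustrative):

    service = GeminiAttributeService()
    product = {
        "sku": "TSHIRT-001",
        "category": "Clothing",
        "title": "Classic Cotton T-Shirt",
        "description": "Soft cotton tee in black.",
        "attributes": {"color": "blak", "size": "M"},
    }
    rules = [{"attribute_name": "color", "is_mandatory": True,
              "valid_values": ["Black", "White", "Navy"]}]
    issues = ["color: 'blak' should be 'Black'"]

    result = service.generate_attribute_suggestions(product, issues, rules)
    print(result.get("corrected_attributes"))  # e.g. {"color": "Black"}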

+ 435 - 0
core/services/seo_scorer.py

@@ -0,0 +1,435 @@
+# seo_scorer.py
+import re
+import logging
+from typing import Dict, List, Tuple, Set
+from collections import Counter
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+class SEODiscoverabilityScorer:
+    """
+    SEO & Discoverability scoring (15% weight)
+    Checks: Keyword coverage, semantic richness, backend keywords
+    """
+    
+    def __init__(self):
+        self.keybert_model = None
+        self.sentence_model = None
+        self._initialize_models()
+        
+        # SEO scoring weights
+        self.weights = {
+            'keyword_coverage': 0.35,      # Are key attributes in title/description?
+            'semantic_richness': 0.30,     # Descriptive quality & vocabulary diversity
+            'backend_keywords': 0.20,      # Presence of searchable backend terms
+            'title_optimization': 0.15     # Title length, structure, readability
+        }
+        
+        # Category-specific important keywords
+        self.category_keywords = {
+            'Electronics': ['brand', 'model', 'warranty', 'condition', 'specs', 'features', 'technology'],
+            'Clothing': ['brand', 'size', 'color', 'material', 'fit', 'style', 'occasion', 'care'],
+            'Home & Garden': ['material', 'dimensions', 'color', 'style', 'brand', 'indoor', 'outdoor'],
+            'Sports': ['brand', 'size', 'sport', 'material', 'performance', 'level', 'gender']
+        }
+        
+        # Common search terms users look for
+        self.high_value_terms = {
+            'quality_indicators': ['premium', 'high-quality', 'durable', 'professional', 'authentic', 'genuine'],
+            'value_indicators': ['affordable', 'budget', 'value', 'economical', 'best', 'top-rated'],
+            'feature_terms': ['lightweight', 'waterproof', 'wireless', 'adjustable', 'portable', 'compact'],
+            'condition_terms': ['new', 'refurbished', 'used', 'like-new', 'open-box']
+        }
+    
+    def _initialize_models(self):
+        """Initialize NLP models with fallback handling"""
+        try:
+            from keybert import KeyBERT
+            self.keybert_model = KeyBERT()
+            logger.info("KeyBERT model loaded successfully")
+        except Exception as e:
+            logger.warning(f"KeyBERT not available: {e}. Using fallback keyword extraction.")
+            self.keybert_model = None
+        
+        try:
+            from sentence_transformers import SentenceTransformer
+            self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
+            logger.info("Sentence transformer model loaded successfully")
+        except Exception as e:
+            logger.warning(f"Sentence transformer not available: {e}. Using fallback semantic analysis.")
+            self.sentence_model = None
+    
+    def score_seo(self, product: Dict, category_rules: List[Dict]) -> Dict:
+        """
+        Main SEO scoring function
+        Returns: score breakdown, issues, and suggestions
+        """
+        try:
+            title = product.get('title', '')
+            description = product.get('description', '')
+            category = product.get('category', '')
+            attributes = product.get('attributes', {})
+            
+            scores = {}
+            issues = []
+            suggestions = []
+            
+            # 1. Keyword Coverage (35%)
+            kw_score, kw_issues, kw_suggestions = self._check_keyword_coverage(
+                title, description, attributes, category, category_rules
+            )
+            scores['keyword_coverage'] = kw_score
+            issues.extend(kw_issues)
+            suggestions.extend(kw_suggestions)
+            
+            # 2. Semantic Richness (30%)
+            semantic_score, semantic_issues, semantic_suggestions = self._check_semantic_richness(
+                title, description
+            )
+            scores['semantic_richness'] = semantic_score
+            issues.extend(semantic_issues)
+            suggestions.extend(semantic_suggestions)
+            
+            # 3. Backend Keywords (20%)
+            backend_score, backend_issues, backend_suggestions = self._check_backend_keywords(
+                title, description, attributes, category
+            )
+            scores['backend_keywords'] = backend_score
+            issues.extend(backend_issues)
+            suggestions.extend(backend_suggestions)
+            
+            # 4. Title Optimization (15%)
+            title_score, title_issues, title_suggestions = self._check_title_optimization(
+                title, attributes
+            )
+            scores['title_optimization'] = title_score
+            issues.extend(title_issues)
+            suggestions.extend(title_suggestions)
+            
+            # Calculate final SEO score
+            final_score = sum(scores[key] * self.weights[key] for key in scores)
+            
+            return {
+                'seo_score': round(final_score, 2),
+                'breakdown': scores,
+                'issues': issues,
+                'suggestions': suggestions,
+                'extracted_keywords': self._extract_keywords(title, description),
+                'missing_high_value_terms': self._find_missing_high_value_terms(title, description, category)
+            }
+            
+        except Exception as e:
+            logger.error(f"SEO scoring error: {e}", exc_info=True)
+            return {
+                'seo_score': 0.0,
+                'breakdown': {},
+                'issues': [f"SEO scoring failed: {str(e)}"],
+                'suggestions': []
+            }
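Concretely, the weighted combination in score_seo works like this (illustrative sub-scores):

    weights = {'keyword_coverage': 0.35, 'semantic_richness': 0.30,
               'backend_keywords': 0.20, 'title_optimization': 0.15}
    scores = {'keyword_coverage': 80.0, 'semantic_richness': 90.0,
              'backend_keywords': 50.0, 'title_optimization': 100.0}

    final = sum(scores[k] * weights[k] for k in scores)
    print(final)  # 28 + 27 + 10 + 15 = 80.0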
+    
+    def _check_keyword_coverage(
+        self, 
+        title: str, 
+        description: str, 
+        attributes: Dict, 
+        category: str,
+        rules: List[Dict]
+    ) -> Tuple[float, List[str], List[str]]:
+        """Check if key product attributes are mentioned in title/description"""
+        issues = []
+        suggestions = []
+        
+        combined_text = f"{title} {description}".lower()
+        mandatory_attrs = [r['attribute_name'] for r in rules if r.get('is_mandatory')]
+        
+        covered_count = 0
+        total_mandatory = len(mandatory_attrs)
+        
+        if total_mandatory == 0:
+            return 100.0, [], []
+        
+        for attr_name in mandatory_attrs:
+            attr_value = attributes.get(attr_name, '')
+            
+            if not attr_value:
+                issues.append(f"SEO: Mandatory attribute '{attr_name}' is missing entirely")
+                suggestions.append(f"Add {attr_name} to improve discoverability")
+                continue
+            
+            attr_value_str = str(attr_value).lower()
+            
+            # Check if attribute value appears in title or description
+            if attr_value_str in combined_text:
+                covered_count += 1
+            elif attr_name.lower() in combined_text:
+                # Attribute name mentioned but not value
+                covered_count += 0.5
+                issues.append(f"SEO: '{attr_name}' mentioned but value '{attr_value}' not clearly stated")
+                suggestions.append(f"Include specific {attr_name} '{attr_value}' in title or description")
+            else:
+                issues.append(f"SEO: Key attribute '{attr_name}: {attr_value}' not mentioned in title/description")
+                suggestions.append(f"Add '{attr_name}: {attr_value}' to title or first line of description")
+        
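+        # Worked example: 2 values found verbatim plus 1 name-only mention
+        # (worth 0.5) out of 4 mandatory attributes -> (2.5 / 4) * 100 = 62.5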
+        score = (covered_count / total_mandatory) * 100  # total_mandatory > 0 is guaranteed by the early return above
+        return score, issues, suggestions
+    
+    def _check_semantic_richness(
+        self, 
+        title: str, 
+        description: str
+    ) -> Tuple[float, List[str], List[str]]:
+        """Evaluate descriptive quality and vocabulary diversity"""
+        issues = []
+        suggestions = []
+        score_components = []
+        
+        # 1. Description length check
+        desc_length = len(description.split())
+        if desc_length < 20:
+            issues.append(f"SEO: Description too short ({desc_length} words, recommended 50+)")
+            suggestions.append("Expand description to 50-150 words for better SEO")
+            length_score = (desc_length / 20) * 100
+        elif desc_length > 300:
+            issues.append(f"SEO: Description very long ({desc_length} words, may hurt readability)")
+            suggestions.append("Consider condensing to 50-200 words for optimal engagement")
+            length_score = 80.0
+        else:
+            length_score = 100.0
+        score_components.append(length_score)
+        
+        # 2. Vocabulary diversity (unique words ratio)
+        words = re.findall(r'\b\w+\b', description.lower())
+        if words:
+            unique_ratio = len(set(words)) / len(words)
+            if unique_ratio < 0.5:
+                issues.append("SEO: Low vocabulary diversity (repetitive text)")
+                suggestions.append("Use more varied descriptive words to improve richness")
+                diversity_score = unique_ratio * 100
+            else:
+                diversity_score = min(unique_ratio * 150, 100)  # Cap at 100
+        else:
+            diversity_score = 0.0
+            issues.append("SEO: Empty or very short description")
+            suggestions.append("Add a detailed product description")
+        score_components.append(diversity_score)
+        
+        # 3. Adjective/descriptive word presence
+        descriptive_patterns = [
+            r'\b(premium|quality|durable|lightweight|comfortable|stylish|modern|classic)\b',
+            r'\b(professional|authentic|genuine|original|certified|official)\b',
+            r'\b(innovative|advanced|smart|efficient|powerful|reliable)\b'
+        ]
+        descriptive_count = sum(len(re.findall(pattern, description.lower())) for pattern in descriptive_patterns)
+        
+        if descriptive_count == 0:
+            issues.append("SEO: No descriptive/quality adjectives found")
+            suggestions.append("Add descriptive words like 'premium', 'durable', 'comfortable' to enhance appeal")
+            descriptive_score = 0.0
+        elif descriptive_count < 3:
+            suggestions.append("Consider adding more descriptive adjectives for better engagement")
+            descriptive_score = (descriptive_count / 3) * 100
+        else:
+            descriptive_score = 100.0
+        score_components.append(descriptive_score)
+        
+        # 4. Sentence structure (not just bullet points)
+        sentences = re.split(r'[.!?]+', description)
+        complete_sentences = [s for s in sentences if len(s.split()) >= 5]
+        if len(complete_sentences) < 2:
+            issues.append("SEO: Description lacks complete sentences (use prose, not just bullet points)")
+            suggestions.append("Write 2-3 complete sentences describing the product")
+            structure_score = (len(complete_sentences) / 2) * 100
+        else:
+            structure_score = 100.0
+        score_components.append(structure_score)
+        
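+        # Final score is the plain average of the four components (length,
+        # diversity, descriptive words, structure); e.g. scores of 100, 80,
+        # 66.7 and 100 average to roughly 86.7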
+        final_score = np.mean(score_components)
+        return final_score, issues, suggestions
+    
+    def _check_backend_keywords(
+        self, 
+        title: str, 
+        description: str, 
+        attributes: Dict,
+        category: str
+    ) -> Tuple[float, List[str], List[str]]:
+        """Check for presence of searchable backend keywords"""
+        issues = []
+        suggestions = []
+        
+        combined_text = f"{title} {description}".lower()
+        
+        # Get category-specific keywords
+        expected_keywords = self.category_keywords.get(category, [])
+        
+        # Join attribute values once, rather than relying on str(dict.values()),
+        # which would also match the literal "dict_values" wrapper text
+        attr_text = " ".join(str(v) for v in attributes.values()).lower()
+        present_count = 0
+        for keyword in expected_keywords:
+            if keyword in combined_text or keyword in attr_text:
+                present_count += 1
+            else:
+                issues.append(f"SEO: Missing common search term '{keyword}' for {category}")
+                suggestions.append(f"Consider mentioning '{keyword}' if applicable to improve searchability")
+        
+        coverage_score = (present_count / len(expected_keywords)) * 100 if expected_keywords else 100.0
+        
+        # Check for high-value terms
+        high_value_present = 0
+        all_high_value = []
+        for category_terms in self.high_value_terms.values():
+            all_high_value.extend(category_terms)
+        
+        for term in all_high_value:
+            if term in combined_text:
+                high_value_present += 1
+        
+        if high_value_present == 0:
+            issues.append("SEO: No high-value search terms found (e.g., 'premium', 'durable', 'best')")
+            suggestions.append("Add 1-2 quality/value indicators to attract more searches")
+            value_score = 0.0
+        elif high_value_present < 2:
+            suggestions.append("Consider adding more value-indicating terms for better positioning")
+            value_score = (high_value_present / 2) * 100
+        else:
+            value_score = 100.0
+        
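+        # Weighted blend: 60% category-keyword coverage, 40% high-value terms;
+        # e.g. coverage 50.0 with value 100.0 gives 0.6*50 + 0.4*100 = 70.0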
+        final_score = (coverage_score * 0.6 + value_score * 0.4)
+        return final_score, issues, suggestions
+    
+    def _check_title_optimization(
+        self, 
+        title: str, 
+        attributes: Dict
+    ) -> Tuple[float, List[str], List[str]]:
+        """Evaluate title quality for SEO"""
+        issues = []
+        suggestions = []
+        score_components = []
+        
+        # 1. Title length (optimal: 50-100 characters)
+        title_len = len(title)
+        if title_len < 30:
+            issues.append(f"SEO: Title too short ({title_len} chars, recommended 50-100)")
+            suggestions.append("Expand title to include key attributes (brand, model, key features)")
+            length_score = (title_len / 30) * 100
+        elif title_len > 150:
+            issues.append(f"SEO: Title too long ({title_len} chars, may be truncated in search)")
+            suggestions.append("Shorten title to 50-100 characters, focus on key selling points")
+            length_score = 70.0
+        else:
+            length_score = 100.0
+        score_components.append(length_score)
+        
+        # 2. Key attributes in title
+        key_attrs = ['brand', 'model', 'color', 'size']
+        present_in_title = sum(
+            1 for attr in key_attrs
+            if attr in attributes and str(attributes[attr]).lower() in title.lower()
+        )
+        
+        if present_in_title < 2:
+            issues.append("SEO: Title missing key attributes (brand, model, color, size)")
+            suggestions.append("Include at least 2-3 key attributes in title")
+            attr_score = (present_in_title / 2) * 100
+        else:
+            attr_score = 100.0
+        score_components.append(attr_score)
+        
+        # 3. No keyword stuffing (repeated words)
+        words = title.lower().split()
+        word_counts = Counter(words)
+        max_repetition = max(word_counts.values()) if word_counts else 0
+        
+        if max_repetition > 3:
+            issues.append("SEO: Title has keyword stuffing (repeated words)")
+            suggestions.append("Remove repeated keywords, make title natural and readable")
+            stuffing_score = 50.0
+        elif max_repetition > 2:
+            suggestions.append("Reduce word repetition in title for better readability")
+            stuffing_score = 75.0
+        else:
+            stuffing_score = 100.0
+        score_components.append(stuffing_score)
+        
+        # 4. Capitalization (Title Case preferred)
+        if title.isupper():
+            issues.append("SEO: Title in ALL CAPS (reduces readability)")
+            suggestions.append("Use Title Case for better readability")
+            case_score = 50.0
+        elif title.islower():
+            issues.append("SEO: Title in lowercase (looks unprofessional)")
+            suggestions.append("Use Title Case or Sentence case")
+            case_score = 60.0
+        else:
+            case_score = 100.0
+        score_components.append(case_score)
+        
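+        # Plain average of the four components (length, attributes, stuffing,
+        # casing); e.g. 100, 50, 100 and 100 average to 87.5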
+        final_score = np.mean(score_components)
+        return final_score, issues, suggestions
+    
+    def _extract_keywords(self, title: str, description: str, top_n: int = 10) -> List[Dict]:
+        """Extract top keywords using KeyBERT or fallback method"""
+        combined_text = f"{title}. {description}"
+        
+        if self.keybert_model:
+            try:
+                keywords = self.keybert_model.extract_keywords(
+                    combined_text,
+                    keyphrase_ngram_range=(1, 2),
+                    stop_words='english',
+                    top_n=top_n
+                )
+                return [{'keyword': kw, 'score': round(score, 3)} for kw, score in keywords]
+            except Exception as e:
+                logger.warning(f"KeyBERT extraction failed: {e}, using fallback")
+        
+        # Fallback: simple word frequency
+        words = re.findall(r'\b\w{4,}\b', combined_text.lower())
+        word_freq = Counter(words).most_common(top_n)
+        return [{'keyword': word, 'score': round(freq / len(words), 3)} for word, freq in word_freq]
+    
+    def _find_missing_high_value_terms(self, title: str, description: str, category: str) -> List[str]:
+        """Identify missing high-value search terms that could improve discoverability"""
+        combined_text = f"{title} {description}".lower()
+        missing_terms = []
+        
+        for term_type, terms in self.high_value_terms.items():
+            found = any(term in combined_text for term in terms)
+            if not found and len(missing_terms) < 5:
+                # Suggest one term from each category
+                missing_terms.append(f"{term_type.replace('_', ' ')}: {terms[0]}")
+        
+        category_terms = self.category_keywords.get(category, [])
+        for term in category_terms[:3]:
+            # Skip terms already present in the text or in an earlier suggestion
+            if term not in combined_text and all(term not in m for m in missing_terms):
+                missing_terms.append(f"category keyword: {term}")
+        
+        return missing_terms[:5]  # Limit to 5 suggestions
+    
+    def generate_seo_report(self, product: Dict, seo_result: Dict) -> str:
+        """Generate a human-readable SEO report"""
+        report = []
+        report.append(f"=== SEO Score: {seo_result['seo_score']}/100 ===\n")
+        
+        report.append("Score Breakdown:")
+        for metric, score in seo_result['breakdown'].items():
+            report.append(f"  - {metric.replace('_', ' ').title()}: {score:.1f}/100")
+        
+        if seo_result['issues']:
+            report.append("\nIssues Found:")
+            for issue in seo_result['issues']:
+                report.append(f"  • {issue}")
+        
+        if seo_result['suggestions']:
+            report.append("\nSuggestions:")
+            for suggestion in seo_result['suggestions']:
+                report.append(f"  ✓ {suggestion}")
+        
+        if seo_result.get('extracted_keywords'):
+            report.append("\nTop Keywords:")
+            for kw in seo_result['extracted_keywords'][:5]:
+                report.append(f"  - {kw['keyword']} (score: {kw['score']})")
+        
+        if seo_result.get('missing_high_value_terms'):
+            report.append("\nMissing High-Value Terms:")
+            for term in seo_result['missing_high_value_terms']:
+                report.append(f"  + {term}")
+        
+        return "\n".join(report)
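+
+# Usage sketch (illustrative only): the scorer class and its scoring entry
+# point are defined earlier in this file, outside this excerpt, so `scorer`
+# and `score_product_seo` below are placeholder names:
+#
+#   scorer = ...  # instance of the SEO scorer defined above
+#   seo_result = scorer.score_product_seo(title, description, attributes, category, rules)
+#   print(scorer.generate_seo_report(product_dict, seo_result))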

+ 3 - 0
core/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 11 - 0
core/urls.py

@@ -0,0 +1,11 @@
+# urls.py
+from django.urls import path
+from .views import AttributeScoreView, BatchScoreView
+
+urlpatterns = [
+    path("attribute_score/", AttributeScoreView.as_view(), name="attribute_score"),
+    path("attribute_score/<str:sku>/", AttributeScoreView.as_view(), name="get_attribute_score"),
+    path("batch_score/", BatchScoreView.as_view(), name="batch_score"),
+]
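+
+# Example requests (illustrative; payload shapes follow core/views.py, and the
+# paths assume this urlconf is included at the project root):
+#
+#   POST /attribute_score/           body: {"product": {"sku": "ELEC-001",
+#                                          "category": "Electronics", "title": "...",
+#                                          "description": "...", "attributes": {...}}}
+#   GET  /attribute_score/ELEC-001/
+#   POST /batch_score/               body: {"products": [{"sku": "...", "category": "...", ...}]}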
+
+

+ 404 - 0
core/views.py

@@ -0,0 +1,404 @@
+# views.py (Enhanced)
+from django.shortcuts import render, get_object_or_404
+from django.http import JsonResponse
+from django.views import View
+from django.core.cache import cache
+import json
+import logging
+
+from core.models import AttributeScore, CategoryAttributeRule, Product
+from core.services.attribute_scorer import AttributeQualityScorer
+from django.views.decorators.csrf import csrf_exempt
+from django.utils.decorators import method_decorator
+
+logger = logging.getLogger(__name__)
+
+
+@method_decorator(csrf_exempt, name='dispatch')
+class AttributeScoreView(View):
+    """Enhanced API view with caching and AI suggestions"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.scorer = AttributeQualityScorer(use_ai=True)  # enable AI
+
+    def post(self, request, *args, **kwargs):
+        """Score a single product with AI suggestions"""
+        try:
+            data = json.loads(request.body)
+            product_data = data.get('product', {})
+            sku = product_data.get('sku')
+            # AI suggestions are always generated for this endpoint, so no
+            # per-request 'use_ai' flag is consulted
+
+            if not sku:
+                return JsonResponse({'error': 'SKU is required'}, status=400)
+
+            category = product_data.get('category', '')
+            if not category:
+                return JsonResponse({'error': 'Category is required'}, status=400)
+
+            # Get or create product
+            product, created = Product.objects.get_or_create(
+                sku=sku,
+                defaults={
+                    'title': product_data.get('title', ''),
+                    'description': product_data.get('description', ''),
+                    'category': category,
+                    'attributes': product_data.get('attributes', {})
+                }
+            )
+
+            # Update if exists
+            if not created:
+                product.title = product_data.get('title', product.title)
+                product.description = product_data.get('description', product.description)
+                product.attributes = product_data.get('attributes', product.attributes)
+                product.save()
+
+            # Get rules (cached)
+            cache_key = f"category_rules_{category}"
+            rules = cache.get(cache_key)
+            if rules is None:
+                rules = list(CategoryAttributeRule.objects.filter(category=category).values())
+                cache.set(cache_key, rules, 3600)
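+                # Note: rule changes can take up to an hour to be picked up;
+                # delete the "category_rules_<category>" key to force a refresh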
+            if not rules:
+                return JsonResponse({'error': f'No rules defined for {category}'}, status=400)
+
+            # Force AI suggestions
+            score_result = self.scorer.score_product(
+                {
+                    'sku': product.sku,
+                    'category': product.category,
+                    'title': product.title,
+                    'description': product.description,
+                    'attributes': product.attributes
+                },
+                rules,
+                generate_ai_suggestions=True  # always generate AI
+            )
+
+            # Save score
+            AttributeScore.objects.create(
+                product=product,
+                score=score_result['final_score'],
+                max_score=score_result['max_score'],
+                details=score_result['breakdown'],
+                issues=score_result['issues'],
+                suggestions=score_result['suggestions'],
+                ai_suggestions=score_result.get('ai_suggestions', {}),
+                processing_time=score_result.get('processing_time', 0)
+            )
+
+            return JsonResponse({
+                'success': True,
+                'product_sku': sku,
+                'created': created,
+                'score_result': score_result
+            })
+
+        except json.JSONDecodeError:
+            return JsonResponse({'error': 'Invalid JSON'}, status=400)
+        except Exception as e:
+            logger.error(f"Error scoring product: {str(e)}", exc_info=True)
+            return JsonResponse({'error': str(e)}, status=500)
+
+    def get(self, request, sku=None):
+        """Return the latest saved score for a product (routed via attribute_score/<sku>/)"""
+        if not sku:
+            return JsonResponse({'error': 'SKU parameter required'}, status=400)
+
+        # Raise a regular 404 outside the try block so it is not swallowed
+        # by the generic exception handler below
+        product = get_object_or_404(Product, sku=sku)
+
+        try:
+            latest_score = product.attribute_scores.order_by('-created_at').first()
+
+            if not latest_score:
+                return JsonResponse({
+                    'message': 'No scores found for this product',
+                    'sku': sku
+                }, status=404)
+
+            return JsonResponse({
+                'sku': product.sku,
+                'title': product.title,
+                'category': product.category,
+                'attributes': product.attributes,
+                'score': latest_score.score,
+                'max_score': latest_score.max_score,
+                'details': latest_score.details,
+                'issues': latest_score.issues,
+                'suggestions': latest_score.suggestions,
+                'ai_suggestions': latest_score.ai_suggestions,
+                'processing_time': latest_score.processing_time,
+                'scored_at': latest_score.created_at.isoformat()
+            })
+        except Exception as e:
+            logger.error(f"Error retrieving score: {str(e)}", exc_info=True)
+            return JsonResponse({'error': str(e)}, status=500)
+
+
+@method_decorator(csrf_exempt, name='dispatch')
+class BatchScoreView(View):
+    """Batch scoring with AI suggestions"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.scorer = AttributeQualityScorer(use_ai=True)  # enable AI even for batch
+
+    def post(self, request):
+        try:
+            data = json.loads(request.body)
+            products = data.get('products', [])
+
+            if not products:
+                return JsonResponse({'error': 'No products provided'}, status=400)
+
+            results = []
+            errors = []
+
+            for product_data in products[:100]:  # limit 100
+                try:
+                    sku = product_data.get('sku')
+                    category = product_data.get('category')
+
+                    if not sku or not category:
+                        errors.append({'sku': sku, 'error': 'Missing SKU or category'})
+                        continue
+
+                    # Get rules
+                    rules = list(CategoryAttributeRule.objects.filter(category=category).values())
+                    if not rules:
+                        errors.append({'sku': sku, 'error': f'No rules for category {category}'})
+                        continue
+
+                    # AI suggestions are always generated, even in batch mode
+                    score_result = self.scorer.score_product(
+                        product_data,
+                        rules,
+                        generate_ai_suggestions=True
+                    )
+
+                    # Persist the product and its score so batch runs are
+                    # queryable later, mirroring the single-product endpoint
+                    product, created = Product.objects.get_or_create(
+                        sku=sku,
+                        defaults={
+                            'title': product_data.get('title', ''),
+                            'description': product_data.get('description', ''),
+                            'category': category,
+                            'attributes': product_data.get('attributes', {})
+                        }
+                    )
+                    if not created:
+                        product.title = product_data.get('title', product.title)
+                        product.description = product_data.get('description', product.description)
+                        product.attributes = product_data.get('attributes', product.attributes)
+                        product.save()
+
+                    AttributeScore.objects.create(
+                        product=product,
+                        score=score_result['final_score'],
+                        max_score=score_result['max_score'],
+                        details=score_result['breakdown'],
+                        issues=score_result['issues'],
+                        suggestions=score_result['suggestions'],
+                        ai_suggestions=score_result.get('ai_suggestions', {}),
+                        processing_time=score_result.get('processing_time', 0)
+                    )
+
+                    results.append({
+                        'sku': sku,
+                        'final_score': score_result['final_score'],
+                        'max_score': score_result['max_score'],
+                        'breakdown': score_result['breakdown'],
+                        'issues': score_result['issues'],
+                        'suggestions': score_result['suggestions'],
+                        'ai_suggestions': score_result.get('ai_suggestions', {}),
+                        'processing_time': score_result.get('processing_time', 0)
+                    })
+
+                except Exception as e:
+                    logger.error(f"Error scoring SKU {product_data.get('sku')}: {str(e)}", exc_info=True)
+                    errors.append({'sku': product_data.get('sku'), 'error': str(e)})
+
+            return JsonResponse({
+                'success': True,
+                'processed': len(results),
+                'results': results,
+                'errors': errors
+            })
+
+        except Exception as e:
+            logger.error(f"Batch scoring error: {str(e)}")
+            return JsonResponse({'error': str(e)}, status=500)

BIN
data/__pycache__/sample_data.cpython-313.pyc


+ 137 - 0
data/sample_data.py

@@ -0,0 +1,137 @@
+# sample_data.py
+"""
+Sample data to test the attribute scoring system
+"""
+
+SAMPLE_CATEGORY_RULES = [
+    {
+        'category': 'Electronics',
+        'attribute_name': 'brand',
+        'is_mandatory': True,
+        'valid_values': ['Apple', 'Samsung', 'Sony', 'LG', 'Dell', 'HP', 'Lenovo'],
+        'data_type': 'string'
+    },
+    {
+        'category': 'Electronics',
+        'attribute_name': 'color',
+        'is_mandatory': True,
+        'valid_values': ['Black', 'White', 'Silver', 'Gray', 'Blue', 'Red', 'Gold', 'Rose Gold'],
+        'data_type': 'string'
+    },
+    {
+        'category': 'Electronics',
+        'attribute_name': 'warranty',
+        'is_mandatory': True,
+        'valid_values': ['1 Year', '2 Years', '3 Years', 'Lifetime'],
+        'data_type': 'string'
+    },
+    {
+        'category': 'Electronics',
+        'attribute_name': 'condition',
+        'is_mandatory': True,
+        'valid_values': ['New', 'Refurbished', 'Used'],
+        'data_type': 'string'
+    },
+    {
+        'category': 'Electronics',
+        'attribute_name': 'model',
+        'is_mandatory': False,
+        'valid_values': [],
+        'data_type': 'string'
+    },
+    {
+        'category': 'Clothing',
+        'attribute_name': 'brand',
+        'is_mandatory': True,
+        'valid_values': ['Nike', 'Adidas', 'Puma', 'Reebok', 'Under Armour'],
+        'data_type': 'string'
+    },
+    {
+        'category': 'Clothing',
+        'attribute_name': 'size',
+        'is_mandatory': True,
+        'valid_values': ['XS', 'S', 'M', 'L', 'XL', 'XXL'],
+        'data_type': 'string'
+    },
+    {
+        'category': 'Clothing',
+        'attribute_name': 'color',
+        'is_mandatory': True,
+        'valid_values': ['Black', 'White', 'Blue', 'Red', 'Green', 'Yellow', 'Gray'],
+        'data_type': 'string'
+    },
+    {
+        'category': 'Clothing',
+        'attribute_name': 'material',
+        'is_mandatory': True,
+        'valid_values': ['Cotton', 'Polyester', 'Wool', 'Silk', 'Nylon', 'Blend'],
+        'data_type': 'string'
+    },
+]
+
+SAMPLE_PRODUCTS = [
+    {
+        'sku': 'ELEC-001',
+        'category': 'Electronics',
+        'title': 'Apple MacBook Pro 14-inch Space Gray',
+        'description': 'Latest Apple MacBook Pro with M3 chip, 14-inch display in Space Gray color.',
+        'attributes': {
+            'brand': 'Apple',
+            'color': 'Space Gray',  # Should suggest "Gray"
+            'warranty': '1 Year',
+            'condition': 'New',
+            'model': 'MacBook Pro 14"'
+        }
+    },
+    {
+        'sku': 'ELEC-002',
+        'category': 'Electronics',
+        'title': 'Samsung Galaxy S24 Ultra',
+        'description': 'Flagship Samsung phone with advanced camera system.',
+        'attributes': {
+            'brand': 'Samsung',
+            'color': 'blak',  # Typo - should suggest "Black"
+            'warranty': 'N/A',  # Placeholder - should flag
+            'condition': 'new',  # Case mismatch - should suggest "New"
+            # Missing 'model'
+        }
+    },
+    {
+        'sku': 'ELEC-003',
+        'category': 'Electronics',
+        'title': 'Sony WH-1000XM5 Wireless Headphones',
+        'description': 'Premium noise-cancelling headphones from Sony.',
+        'attributes': {
+            # Missing 'brand' - mandatory field
+            'color': 'Black',
+            'warranty': '2 Years',
+            'condition': 'Refurbished'
+        }
+    },
+    {
+        'sku': 'CLTH-001',
+        'category': 'Clothing',
+        'title': 'Nike Dri-FIT Running T-Shirt Blue Medium',
+        'description': 'Lightweight Nike running shirt in blue color, size Medium.',
+        'attributes': {
+            'brand': 'Nike',
+            'size': 'M',
+            'color': 'Blue',
+            'material': 'Polyester'
+        }
+    },
+    {
+        'sku': 'CLTH-002',
+        'category': 'Clothing',
+        'title': 'Adidas Hoodie',
+        'description': 'Comfortable hoodie for casual wear.',
+        'attributes': {
+            'brand': 'Adiddas',  # Typo - should suggest "Adidas"
+            'size': 'Large',  # Should suggest "L"
+            'color': '',  # Empty - should flag
+            # Missing 'material' - mandatory field
+        }
+    },
+]
+
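+# Loading sketch (illustrative; assumes CategoryAttributeRule fields mirror
+# the dict keys above — check core/models.py for the actual definition):
+#
+#   from core.models import CategoryAttributeRule
+#   from data.sample_data import SAMPLE_CATEGORY_RULES
+#
+#   for rule in SAMPLE_CATEGORY_RULES:
+#       CategoryAttributeRule.objects.update_or_create(
+#           category=rule['category'],
+#           attribute_name=rule['attribute_name'],
+#           defaults={k: rule[k] for k in ('is_mandatory', 'valid_values', 'data_type')},
+#       )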

BIN
db.sqlite3


+ 22 - 0
manage.py

@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+"""Django's command-line utility for administrative tasks."""
+import os
+import sys
+
+
+def main():
+    """Run administrative tasks."""
+    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'content_quality_tool.settings')
+    try:
+        from django.core.management import execute_from_command_line
+    except ImportError as exc:
+        raise ImportError(
+            "Couldn't import Django. Are you sure it's installed and "
+            "available on your PYTHONPATH environment variable? Did you "
+            "forget to activate a virtual environment?"
+        ) from exc
+    execute_from_command_line(sys.argv)
+
+
+if __name__ == '__main__':
+    main()

+ 28 - 0
requirements.txt

@@ -0,0 +1,28 @@
+# Core Django
+Django>=4.2.0
+djangorestframework>=3.14.0
+
+# Database
+psycopg2-binary>=2.9.0  # PostgreSQL adapter
+
+# AI/ML Libraries
+google-generativeai>=0.3.0  # Gemini API
+spacy>=3.7.0  # NLP for attribute extraction
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl  # spaCy English model
+
+# SEO & Keyword Analysis (NEW)
+keybert>=0.8.0  # Keyword extraction
+sentence-transformers>=2.2.0  # Semantic similarity
+scikit-learn>=1.3.0  # ML utilities for KeyBERT
+
+# Text Processing
+rapidfuzz>=3.5.0  # Fast fuzzy string matching
+tenacity>=8.2.0  # Retry logic
+
+# Utilities
+numpy>=1.24.0  # Numerical operations
+python-dotenv>=1.0.0  # Environment variables
+
+# Optional: for enhanced SEO analysis
+nltk>=3.8.0  # Natural language toolkit
+textstat>=0.7.3  # Readability metrics