Harshit Pathak пре 3 месеци
родитељ
комит
ae7867812b

+ 0 - 0
attr_extraction/__init__.py


+ 3 - 0
attr_extraction/admin.py

@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.

+ 6 - 0
attr_extraction/apps.py

@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class AttrExtractionConfig(AppConfig):
+    """Django application configuration for the ``attr_extraction`` app."""
+
+    # Primary-key field type used for models that do not declare one explicitly.
+    default_auto_field = 'django.db.models.BigAutoField'
+    # Python package name of the application this configuration belongs to.
+    name = 'attr_extraction'

+ 27 - 0
attr_extraction/migrations/0001_initial.py

@@ -0,0 +1,27 @@
+# Generated by Django 5.2.7 on 2025-10-17 10:21
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Initial schema migration: creates the ``Product`` model."""
+
+    initial = True
+
+    # No other migrations are listed as prerequisites.
+    dependencies = [
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Product',
+            fields=[
+                # Auto-created surrogate primary key.
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                # Unique item identifier; the only uniqueness constraint in this table.
+                ('item_id', models.CharField(max_length=100, unique=True)),
+                ('product_name', models.CharField(max_length=255)),
+                # All remaining columns are optional (blank and NULL allowed).
+                ('product_long_description', models.TextField(blank=True, null=True)),
+                ('product_short_description', models.TextField(blank=True, null=True)),
+                ('product_type', models.CharField(blank=True, max_length=100, null=True)),
+                ('image_path', models.CharField(blank=True, max_length=500, null=True)),
+                ('image', models.ImageField(blank=True, null=True, upload_to='products/')),
+            ],
+        ),
+    ]

+ 0 - 0
attr_extraction/migrations/__init__.py


+ 16 - 0
attr_extraction/models.py

@@ -0,0 +1,16 @@
+from django.db import models
+
+class Product(models.Model):
+    """
+    Stores product details.
+
+    Only ``item_id`` and ``product_name`` are required; every other field is
+    declared with ``blank=True, null=True`` and may be left empty.
+    """
+    # Item identifier; ``unique=True`` forbids two products with the same value.
+    item_id = models.CharField(max_length=100, unique=True)
+    product_name = models.CharField(max_length=255)
+    product_long_description = models.TextField(blank=True, null=True)
+    product_short_description = models.TextField(blank=True, null=True)
+    product_type = models.CharField(max_length=100, blank=True, null=True)
+    # Image location kept as plain text (presumably a path or URL - confirm with callers).
+    image_path = models.CharField(max_length=500, blank=True, null=True)
+    # Image file; ``upload_to`` places uploaded files under ``products/``.
+    image = models.ImageField(upload_to='products/', blank=True, null=True)
+
+    def __str__(self) -> str:
+        """Return the product name followed by the item id in parentheses."""
+        return f"{self.product_name} ({self.item_id})"

+ 151 - 0
attr_extraction/ocr_service.py

@@ -0,0 +1,151 @@
+# ==================== ocr_service.py ====================
+import cv2
+import easyocr
+import numpy as np
+import re
+import requests
+from io import BytesIO
+from PIL import Image
+from typing import List, Tuple, Dict, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class OCRService:
+    """Service for extracting text from product images using OCR."""
+    
+    def __init__(self):
+        self.reader = None
+    
+    def _get_reader(self):
+        """Lazy load EasyOCR reader."""
+        if self.reader is None:
+            self.reader = easyocr.Reader(['en'], gpu=False)
+        return self.reader
+    
+    def download_image(self, image_url: str) -> Optional[np.ndarray]:
+        """Download image from URL and convert to OpenCV format."""
+        try:
+            response = requests.get(image_url, timeout=10)
+            response.raise_for_status()
+            
+            # Convert to PIL Image then to OpenCV format
+            pil_image = Image.open(BytesIO(response.content))
+            image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
+            return image
+        except Exception as e:
+            logger.error(f"Error downloading image from {image_url}: {str(e)}")
+            return None
+    
+    def preprocess_horizontal(self, image: np.ndarray) -> np.ndarray:
+        """Preprocess image for horizontal text."""
+        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        enhanced = cv2.GaussianBlur(gray, (5, 5), 0)
+        _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        return binary
+    
+    def preprocess_vertical(self, image: np.ndarray) -> np.ndarray:
+        """Preprocess image for vertical text."""
+        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        enhanced = cv2.equalizeHist(gray)
+        thresh = cv2.adaptiveThreshold(
+            enhanced, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 15, 10
+        )
+        return thresh
+    
+    def detect_text_regions(self, image: np.ndarray, preprocess_func) -> List[Tuple]:
+        """Detect text regions using contours."""
+        processed = preprocess_func(image)
+        contours, _ = cv2.findContours(processed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+        text_regions = []
+        for contour in contours:
+            x, y, w, h = cv2.boundingRect(contour)
+            if w > 30 and h > 30:  # Filter small regions
+                aspect_ratio = h / w
+                text_regions.append((x, y, w, h, aspect_ratio))
+        return text_regions
+    
+    def classify_and_extract_text(self, image: np.ndarray, regions: List[Tuple]) -> List[Tuple]:
+        """Classify regions as horizontal or vertical and extract text."""
+        reader = self._get_reader()
+        all_detected_text = []
+        
+        for (x, y, w, h, aspect_ratio) in regions:
+            roi = image[y:y + h, x:x + w]
+            if aspect_ratio > 1.5:  # Vertical text
+                roi = cv2.rotate(roi, cv2.ROTATE_90_CLOCKWISE)
+            
+            results = reader.readtext(roi, detail=1)
+            for _, text, confidence in results:
+                all_detected_text.append((text, confidence))
+        
+        return all_detected_text
+    
+    def clean_ocr_output(self, ocr_results: List[Tuple], confidence_threshold: float = 0.40) -> List[Tuple]:
+        """Clean OCR results by removing unwanted characters and low-confidence detections."""
+        cleaned_results = []
+        for text, confidence in ocr_results:
+            if confidence < confidence_threshold:
+                continue
+            
+            # Remove unwanted characters using regex
+            cleaned_text = re.sub(r"[^A-Za-z0-9\s\.\,\(\)\-\%\/]", "", text)
+            cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()
+            
+            # Remove unwanted numeric characters like single digits
+            if len(cleaned_text) == 1 and cleaned_text.isdigit():
+                continue
+            
+            if any(char.isdigit() for char in cleaned_text) and len(cleaned_text) < 2:
+                continue
+            
+            if len(cleaned_text.strip()) > 0:
+                cleaned_results.append((cleaned_text.strip(), confidence))
+        
+        return cleaned_results
+    
+    def process_image(self, image_url: str) -> Dict:
+        """Main method to process image and extract text."""
+        try:
+            # Download image
+            image = self.download_image(image_url)
+            if image is None:
+                return {
+                    "detected_text": [],
+                    "extracted_attributes": {},
+                    "error": "Failed to download image"
+                }
+            
+            # Detect and process horizontal text
+            horizontal_regions = self.detect_text_regions(image, self.preprocess_horizontal)
+            horizontal_text = self.classify_and_extract_text(image, horizontal_regions)
+            
+            # Detect and process vertical text
+            vertical_regions = self.detect_text_regions(image, self.preprocess_vertical)
+            vertical_text = self.classify_and_extract_text(image, vertical_regions)
+            
+            # Combine results
+            all_text = horizontal_text + vertical_text
+            
+            # Clean results
+            cleaned_results = self.clean_ocr_output(all_text, confidence_threshold=0.40)
+            
+            # Format for response
+            detected_text = [
+                {"text": text, "confidence": float(confidence)} 
+                for text, confidence in cleaned_results
+            ]
+            
+            return {
+                "detected_text": detected_text,
+                "extracted_attributes": {}
+            }
+            
+        except Exception as e:
+            logger.error(f"Error processing image: {str(e)}")
+            return {
+                "detected_text": [],
+                "extracted_attributes": {},
+                "error": str(e)
+            }

+ 196 - 0
attr_extraction/serializers.py

@@ -0,0 +1,196 @@
+# from rest_framework import serializers
+
+# class ProductInputSerializer(serializers.Serializer):
+#     """Serializer for individual product input."""
+#     product_id = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+#     title = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+#     short_desc = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+#     long_desc = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+
+
+# class SingleProductRequestSerializer(serializers.Serializer):
+#     """Serializer for single product extraction request."""
+#     title = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+#     short_desc = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+#     long_desc = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+#     mandatory_attrs = serializers.DictField(
+#         child=serializers.ListField(child=serializers.CharField()),
+#         required=True
+#     )
+#     model = serializers.CharField(required=False, default="llama-3.1-8b-instant")
+#     extract_additional = serializers.BooleanField(required=False, default=True)
+
+#     def validate_model(self, value):
+#         from django.conf import settings
+#         if value not in settings.SUPPORTED_MODELS:
+#             raise serializers.ValidationError(
+#                 f"Model must be one of {settings.SUPPORTED_MODELS}"
+#             )
+#         return value
+
+
+# class BatchProductRequestSerializer(serializers.Serializer):
+#     """Serializer for batch product extraction request."""
+#     products = serializers.ListField(
+#         child=ProductInputSerializer(),
+#         required=True,
+#         min_length=1
+#     )
+#     mandatory_attrs = serializers.DictField(
+#         child=serializers.ListField(child=serializers.CharField()),
+#         required=True
+#     )
+#     model = serializers.CharField(required=False, default="llama-3.1-8b-instant")
+#     extract_additional = serializers.BooleanField(required=False, default=True)
+
+#     def validate_model(self, value):
+#         from django.conf import settings
+#         if value not in settings.SUPPORTED_MODELS:
+#             raise serializers.ValidationError(
+#                 f"Model must be one of {settings.SUPPORTED_MODELS}"
+#             )
+#         return value
+
+#     def validate_products(self, value):
+#         from django.conf import settings
+#         max_size = getattr(settings, 'MAX_BATCH_SIZE', 100)
+#         if len(value) > max_size:
+#             raise serializers.ValidationError(
+#                 f"Batch size cannot exceed {max_size} products"
+#             )
+#         return value
+
+
+# class ProductAttributeResultSerializer(serializers.Serializer):
+#     """Serializer for individual product extraction result."""
+#     product_id = serializers.CharField(required=False)
+#     mandatory = serializers.DictField()
+#     additional = serializers.DictField(required=False)
+#     error = serializers.CharField(required=False)
+#     raw_output = serializers.CharField(required=False)
+
+
+# class BatchProductResponseSerializer(serializers.Serializer):
+#     """Serializer for batch extraction response."""
+#     results = serializers.ListField(child=ProductAttributeResultSerializer())
+#     total_products = serializers.IntegerField()
+#     successful = serializers.IntegerField()
+#     failed = serializers.IntegerField()
+
+
+
+
+
+
+
+
+# ==================== serializers.py ====================
+from rest_framework import serializers
+
+class ProductInputSerializer(serializers.Serializer):
+    """Serializer for individual product input.
+
+    Every field is optional and accepts blank or null values.
+    """
+    product_id = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+    title = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+    short_desc = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+    long_desc = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+    # Validated as a URL when a non-empty value is supplied.
+    image_url = serializers.URLField(required=False, allow_blank=True, allow_null=True)
+
+
+class SingleProductRequestSerializer(serializers.Serializer):
+    """Serializer for single product extraction request."""
+    title = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+    short_desc = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+    long_desc = serializers.CharField(required=False, allow_blank=True, allow_null=True)
+    image_url = serializers.URLField(required=False, allow_blank=True, allow_null=True)
+    mandatory_attrs = serializers.DictField(
+        child=serializers.ListField(child=serializers.CharField()),
+        required=True
+    )
+    model = serializers.CharField(required=False, default="llama-3.1-8b-instant")
+    extract_additional = serializers.BooleanField(required=False, default=True)
+    process_image = serializers.BooleanField(required=False, default=True)
+
+    def validate_model(self, value):
+        from django.conf import settings
+        if value not in settings.SUPPORTED_MODELS:
+            raise serializers.ValidationError(
+                f"Model must be one of {settings.SUPPORTED_MODELS}"
+            )
+        return value
+
+
+class BatchProductRequestSerializer(serializers.Serializer):
+    """Serializer for batch product extraction request."""
+    products = serializers.ListField(
+        child=ProductInputSerializer(),
+        required=True,
+        min_length=1
+    )
+    mandatory_attrs = serializers.DictField(
+        child=serializers.ListField(child=serializers.CharField()),
+        required=True
+    )
+    model = serializers.CharField(required=False, default="llama-3.1-8b-instant")
+    extract_additional = serializers.BooleanField(required=False, default=True)
+    process_image = serializers.BooleanField(required=False, default=True)
+
+    def validate_model(self, value):
+        from django.conf import settings
+        if value not in settings.SUPPORTED_MODELS:
+            raise serializers.ValidationError(
+                f"Model must be one of {settings.SUPPORTED_MODELS}"
+            )
+        return value
+
+    def validate_products(self, value):
+        from django.conf import settings
+        max_size = getattr(settings, 'MAX_BATCH_SIZE', 100)
+        if len(value) > max_size:
+            raise serializers.ValidationError(
+                f"Batch size cannot exceed {max_size} products"
+            )
+        return value
+
+
+class OCRResultSerializer(serializers.Serializer):
+    """Serializer for OCR results."""
+    detected_text = serializers.ListField(child=serializers.DictField())
+    extracted_attributes = serializers.DictField()
+
+
+class ProductAttributeResultSerializer(serializers.Serializer):
+    """Serializer for individual product extraction result.
+
+    ``mandatory`` is the only required field; all others may be omitted.
+    """
+    product_id = serializers.CharField(required=False)
+    mandatory = serializers.DictField()
+    additional = serializers.DictField(required=False)
+    # Nested OCR output, shaped by ``OCRResultSerializer``.
+    ocr_results = OCRResultSerializer(required=False)
+    error = serializers.CharField(required=False)
+    raw_output = serializers.CharField(required=False)
+
+
+class BatchProductResponseSerializer(serializers.Serializer):
+    """Serializer for batch extraction response.
+
+    Carries one ``ProductAttributeResultSerializer`` entry per product plus
+    three integer counters; all four fields are required.
+    """
+    results = serializers.ListField(child=ProductAttributeResultSerializer())
+    total_products = serializers.IntegerField()
+    successful = serializers.IntegerField()
+    failed = serializers.IntegerField()
+
+
+
+
+from rest_framework import serializers
+from .models import Product
+
+class ProductSerializer(serializers.ModelSerializer):
+    """Model serializer for ``Product`` exposing ``id`` and every declared field."""
+
+    class Meta:
+        model = Product
+        # Explicit field list; the order here is the order used in the output.
+        fields = [
+            'id',
+            'item_id',
+            'product_name',
+            'product_long_description',
+            'product_short_description',
+            'product_type',
+            'image_path',
+            'image',
+        ]

+ 942 - 0
attr_extraction/services.py

@@ -0,0 +1,942 @@
+# # import requests
+# # import json
+# # from typing import Dict, List, Optional
+# # from django.conf import settings
+
+
+# # class ProductAttributeService:
+# #     """Service class for extracting product attributes using Groq LLM."""
+
+# #     @staticmethod
+# #     def combine_product_text(
+# #         title: Optional[str] = None,
+# #         short_desc: Optional[str] = None,
+# #         long_desc: Optional[str] = None
+# #     ) -> str:
+# #         """Combine product metadata into a single text block."""
+# #         parts = []
+# #         if title:
+# #             parts.append(str(title).strip())
+# #         if short_desc:
+# #             parts.append(str(short_desc).strip())
+# #         if long_desc:
+# #             parts.append(str(long_desc).strip())
+
+# #         combined = " ".join(parts).strip()
+
+# #         if not combined:
+# #             return "No product information available"
+
+# #         return combined
+
+# #     @staticmethod
+# #     def extract_attributes(
+# #         product_text: str,
+# #         mandatory_attrs: Dict[str, List[str]],
+# #         model: str = None,
+# #         extract_additional: bool = True
+# #     ) -> dict:
+# #         """Use Groq LLM to extract attributes from any product type."""
+        
+# #         if model is None:
+# #             model = settings.SUPPORTED_MODELS[0]
+
+# #         # Check if product text is empty or minimal
+# #         if not product_text or product_text == "No product information available":
+# #             return ProductAttributeService._create_error_response(
+# #                 "No product information provided",
+# #                 mandatory_attrs,
+# #                 extract_additional
+# #             )
+
+# #         # Create structured prompt for mandatory attributes
+# #         mandatory_attr_list = []
+# #         for attr_name, allowed_values in mandatory_attrs.items():
+# #             mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
+# #         mandatory_attr_text = "\n".join(mandatory_attr_list)
+
+# #         additional_instruction = ""
+# #         if extract_additional:
+# #             additional_instruction = """
+# # 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text 
+# #    (such as Material, Size, Color, Brand, Dimensions, Weight, Features, Specifications, etc.) 
+# #    and their values. Extract attributes that are specific and relevant to this product type."""
+
+# #         output_format = {
+# #             "mandatory": {attr: "value" for attr in mandatory_attrs.keys()},
+# #             "additional": {} if extract_additional else None
+# #         }
+
+# #         if not extract_additional:
+# #             output_format.pop("additional")
+
+# #         prompt = f"""
+# # You are an intelligent product attribute extractor that works with ANY product type.
+
+# # TASK:
+# # 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value 
+# #    from the provided list. Choose the value that best matches the product description.
+# # {additional_instruction}
+
+# # Product Text:
+# # {product_text}
+
+# # Mandatory Attribute Lists (MUST select one value for each):
+# # {mandatory_attr_text}
+
+# # CRITICAL INSTRUCTIONS:
+# # - Return ONLY valid JSON, nothing else
+# # - No explanations, no markdown, no text before or after the JSON
+# # - For mandatory attributes, choose EXACTLY ONE value from the provided list that best matches
+# # - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
+# # - Work with whatever information is available - the product text may be incomplete (only title, or only description, etc.)
+# # {f"- For additional attributes, extract any relevant information found in the product text" if extract_additional else ""}
+# # - Be precise and only extract information that is explicitly stated or clearly implied
+
+# # Required Output Format (ONLY THIS, NO OTHER TEXT):
+# # {json.dumps(output_format, indent=2)}
+# #         """
+
+# #         payload = {
+# #             "model": model,
+# #             "messages": [
+# #                 {
+# #                     "role": "system",
+# #                     "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
+# #                 },
+# #                 {"role": "user", "content": prompt}
+# #             ],
+# #             "temperature": 0.0,
+# #             "max_tokens": 1500
+# #         }
+
+# #         headers = {
+# #             "Authorization": f"Bearer {settings.GROQ_API_KEY}",
+# #             "Content-Type": "application/json",
+# #         }
+
+# #         try:
+# #             response = requests.post(
+# #                 settings.GROQ_API_URL,
+# #                 headers=headers,
+# #                 json=payload,
+# #                 timeout=30
+# #             )
+# #             response.raise_for_status()
+# #             result_text = response.json()["choices"][0]["message"]["content"].strip()
+
+# #             # Clean the response
+# #             result_text = ProductAttributeService._clean_json_response(result_text)
+
+# #             # Parse JSON
+# #             parsed = json.loads(result_text)
+
+# #             # Validate and restructure if needed
+# #             parsed = ProductAttributeService._validate_response_structure(
+# #                 parsed, mandatory_attrs, extract_additional
+# #             )
+
+# #             return parsed
+
+# #         except requests.exceptions.RequestException as e:
+# #             return ProductAttributeService._create_error_response(
+# #                 str(e), mandatory_attrs, extract_additional
+# #             )
+# #         except json.JSONDecodeError as e:
+# #             return ProductAttributeService._create_error_response(
+# #                 f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
+# #             )
+# #         except Exception as e:
+# #             return ProductAttributeService._create_error_response(
+# #                 str(e), mandatory_attrs, extract_additional
+# #             )
+
+# #     @staticmethod
+# #     def _clean_json_response(text: str) -> str:
+# #         """Clean LLM response to extract valid JSON."""
+# #         start_idx = text.find('{')
+# #         end_idx = text.rfind('}')
+
+# #         if start_idx != -1 and end_idx != -1:
+# #             text = text[start_idx:end_idx + 1]
+
+# #         if "```json" in text:
+# #             text = text.split("```json")[1].split("```")[0].strip()
+# #         elif "```" in text:
+# #             text = text.split("```")[1].split("```")[0].strip()
+# #             if text.startswith("json"):
+# #                 text = text[4:].strip()
+
+# #         return text
+
+# #     @staticmethod
+# #     def _validate_response_structure(
+# #         parsed: dict,
+# #         mandatory_attrs: Dict[str, List[str]],
+# #         extract_additional: bool
+# #     ) -> dict:
+# #         """Validate and fix the response structure."""
+# #         expected_sections = ["mandatory"]
+# #         if extract_additional:
+# #             expected_sections.append("additional")
+
+# #         if not all(section in parsed for section in expected_sections):
+# #             if isinstance(parsed, dict):
+# #                 mandatory_keys = set(mandatory_attrs.keys())
+# #                 mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
+# #                 additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
+
+# #                 result = {"mandatory": mandatory}
+# #                 if extract_additional:
+# #                     result["additional"] = additional
+# #                 return result
+# #             else:
+# #                 return ProductAttributeService._create_error_response(
+# #                     "Invalid response structure",
+# #                     mandatory_attrs,
+# #                     extract_additional,
+# #                     str(parsed)
+# #                 )
+
+# #         return parsed
+
+# #     @staticmethod
+# #     def _create_error_response(
+# #         error: str,
+# #         mandatory_attrs: Dict[str, List[str]],
+# #         extract_additional: bool,
+# #         raw_output: Optional[str] = None
+# #     ) -> dict:
+# #         """Create a standardized error response."""
+# #         response = {
+# #             "mandatory": {attr: "Not Specified" for attr in mandatory_attrs.keys()},
+# #             "error": error
+# #         }
+# #         if extract_additional:
+# #             response["additional"] = {}
+# #         if raw_output:
+# #             response["raw_output"] = raw_output
+# #         return response
+
+
+
+
+
+# import requests
+# import json
+# from typing import Dict, List, Optional
+# from django.conf import settings
+# from concurrent.futures import ThreadPoolExecutor, as_completed
+
+
+# class ProductAttributeService:
+#     """Service class for extracting product attributes using Groq LLM."""
+
+#     @staticmethod
+#     def combine_product_text(
+#         title: Optional[str] = None,
+#         short_desc: Optional[str] = None,
+#         long_desc: Optional[str] = None
+#     ) -> str:
+#         """Combine product metadata into a single text block."""
+#         parts = []
+#         if title:
+#             parts.append(str(title).strip())
+#         if short_desc:
+#             parts.append(str(short_desc).strip())
+#         if long_desc:
+#             parts.append(str(long_desc).strip())
+
+#         combined = " ".join(parts).strip()
+
+#         if not combined:
+#             return "No product information available"
+
+#         return combined
+
+#     @staticmethod
+#     def extract_attributes(
+#         product_text: str,
+#         mandatory_attrs: Dict[str, List[str]],
+#         model: str = None,
+#         extract_additional: bool = True
+#     ) -> dict:
+#         """Use Groq LLM to extract attributes from any product type."""
+        
+#         if model is None:
+#             model = settings.SUPPORTED_MODELS[0]
+
+#         # Check if product text is empty or minimal
+#         if not product_text or product_text == "No product information available":
+#             return ProductAttributeService._create_error_response(
+#                 "No product information provided",
+#                 mandatory_attrs,
+#                 extract_additional
+#             )
+
+#         # Create structured prompt for mandatory attributes
+#         mandatory_attr_list = []
+#         for attr_name, allowed_values in mandatory_attrs.items():
+#             mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
+#         mandatory_attr_text = "\n".join(mandatory_attr_list)
+
+#         additional_instruction = ""
+#         if extract_additional:
+#             additional_instruction = """
+# 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text 
+#    (such as Material, Size, Color, Brand, Dimensions, Weight, Features, Specifications, etc.) 
+#    and their values. Extract attributes that are specific and relevant to this product type."""
+
+#         output_format = {
+#             "mandatory": {attr: "value" for attr in mandatory_attrs.keys()},
+#             "additional": {} if extract_additional else None
+#         }
+
+#         if not extract_additional:
+#             output_format.pop("additional")
+
+#         prompt = f"""
+# You are an intelligent product attribute extractor that works with ANY product type.
+
+# TASK:
+# 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value 
+#    from the provided list. Choose the value that best matches the product description.
+# {additional_instruction}
+
+# Product Text:
+# {product_text}
+
+# Mandatory Attribute Lists (MUST select one value for each):
+# {mandatory_attr_text}
+
+# CRITICAL INSTRUCTIONS:
+# - Return ONLY valid JSON, nothing else
+# - No explanations, no markdown, no text before or after the JSON
+# - For mandatory attributes, choose EXACTLY ONE value from the provided list that best matches
+# - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
+# - Work with whatever information is available - the product text may be incomplete (only title, or only description, etc.)
+# {f"- For additional attributes, extract any relevant information found in the product text" if extract_additional else ""}
+# - Be precise and only extract information that is explicitly stated or clearly implied
+
+# Required Output Format (ONLY THIS, NO OTHER TEXT):
+# {json.dumps(output_format, indent=2)}
+#         """
+
+#         payload = {
+#             "model": model,
+#             "messages": [
+#                 {
+#                     "role": "system",
+#                     "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
+#                 },
+#                 {"role": "user", "content": prompt}
+#             ],
+#             "temperature": 0.0,
+#             "max_tokens": 1500
+#         }
+
+#         headers = {
+#             "Authorization": f"Bearer {settings.GROQ_API_KEY}",
+#             "Content-Type": "application/json",
+#         }
+
+#         try:
+#             response = requests.post(
+#                 settings.GROQ_API_URL,
+#                 headers=headers,
+#                 json=payload,
+#                 timeout=30
+#             )
+#             response.raise_for_status()
+#             result_text = response.json()["choices"][0]["message"]["content"].strip()
+
+#             # Clean the response
+#             result_text = ProductAttributeService._clean_json_response(result_text)
+
+#             # Parse JSON
+#             parsed = json.loads(result_text)
+
+#             # Validate and restructure if needed
+#             parsed = ProductAttributeService._validate_response_structure(
+#                 parsed, mandatory_attrs, extract_additional
+#             )
+
+#             return parsed
+
+#         except requests.exceptions.RequestException as e:
+#             return ProductAttributeService._create_error_response(
+#                 str(e), mandatory_attrs, extract_additional
+#             )
+#         except json.JSONDecodeError as e:
+#             return ProductAttributeService._create_error_response(
+#                 f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
+#             )
+#         except Exception as e:
+#             return ProductAttributeService._create_error_response(
+#                 str(e), mandatory_attrs, extract_additional
+#             )
+
+#     @staticmethod
+#     def extract_attributes_batch(
+#         products: List[Dict],
+#         mandatory_attrs: Dict[str, List[str]],
+#         model: str = None,
+#         extract_additional: bool = True,
+#         max_workers: int = 5
+#     ) -> Dict:
+#         """
+#         Extract attributes for multiple products in parallel.
+        
+#         Args:
+#             products: List of product dictionaries with keys: product_id, title, short_desc, long_desc
+#             mandatory_attrs: Dictionary of mandatory attributes
+#             model: Groq model to use
+#             extract_additional: Whether to extract additional attributes
+#             max_workers: Maximum number of parallel workers
+            
+#         Returns:
+#             Dictionary with results, total_products, successful, and failed counts
+#         """
+#         results = []
+#         successful = 0
+#         failed = 0
+
+#         def process_product(product_data):
+#             """Process a single product."""
+#             product_id = product_data.get('product_id', f"product_{len(results)}")
+            
+#             try:
+#                 product_text = ProductAttributeService.combine_product_text(
+#                     title=product_data.get('title'),
+#                     short_desc=product_data.get('short_desc'),
+#                     long_desc=product_data.get('long_desc')
+#                 )
+                
+#                 result = ProductAttributeService.extract_attributes(
+#                     product_text=product_text,
+#                     mandatory_attrs=mandatory_attrs,
+#                     model=model,
+#                     extract_additional=extract_additional
+#                 )
+                
+#                 result['product_id'] = product_id
+                
+#                 # Check if extraction was successful
+#                 if 'error' not in result:
+#                     return result, True
+#                 else:
+#                     return result, False
+                    
+#             except Exception as e:
+#                 return {
+#                     'product_id': product_id,
+#                     'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
+#                     'additional': {} if extract_additional else None,
+#                     'error': f"Processing error: {str(e)}"
+#                 }, False
+
+#         # Process products in parallel
+#         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+#             future_to_product = {
+#                 executor.submit(process_product, product): product 
+#                 for product in products
+#             }
+            
+#             for future in as_completed(future_to_product):
+#                 try:
+#                     result, success = future.result()
+#                     results.append(result)
+#                     if success:
+#                         successful += 1
+#                     else:
+#                         failed += 1
+#                 except Exception as e:
+#                     failed += 1
+#                     results.append({
+#                         'product_id': 'unknown',
+#                         'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
+#                         'additional': {} if extract_additional else None,
+#                         'error': f"Unexpected error: {str(e)}"
+#                     })
+
+#         return {
+#             'results': results,
+#             'total_products': len(products),
+#             'successful': successful,
+#             'failed': failed
+#         }
+
+#     @staticmethod
+#     def _clean_json_response(text: str) -> str:
+#         """Clean LLM response to extract valid JSON."""
+#         start_idx = text.find('{')
+#         end_idx = text.rfind('}')
+
+#         if start_idx != -1 and end_idx != -1:
+#             text = text[start_idx:end_idx + 1]
+
+#         if "```json" in text:
+#             text = text.split("```json")[1].split("```")[0].strip()
+#         elif "```" in text:
+#             text = text.split("```")[1].split("```")[0].strip()
+#             if text.startswith("json"):
+#                 text = text[4:].strip()
+
+#         return text
+
+#     @staticmethod
+#     def _validate_response_structure(
+#         parsed: dict,
+#         mandatory_attrs: Dict[str, List[str]],
+#         extract_additional: bool
+#     ) -> dict:
+#         """Validate and fix the response structure."""
+#         expected_sections = ["mandatory"]
+#         if extract_additional:
+#             expected_sections.append("additional")
+
+#         if not all(section in parsed for section in expected_sections):
+#             if isinstance(parsed, dict):
+#                 mandatory_keys = set(mandatory_attrs.keys())
+#                 mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
+#                 additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
+
+#                 result = {"mandatory": mandatory}
+#                 if extract_additional:
+#                     result["additional"] = additional
+#                 return result
+#             else:
+#                 return ProductAttributeService._create_error_response(
+#                     "Invalid response structure",
+#                     mandatory_attrs,
+#                     extract_additional,
+#                     str(parsed)
+#                 )
+
+#         return parsed
+
+#     @staticmethod
+#     def _create_error_response(
+#         error: str,
+#         mandatory_attrs: Dict[str, List[str]],
+#         extract_additional: bool,
+#         raw_output: Optional[str] = None
+#     ) -> dict:
+#         """Create a standardized error response."""
+#         response = {
+#             "mandatory": {attr: "Not Specified" for attr in mandatory_attrs.keys()},
+#             "error": error
+#         }
+#         if extract_additional:
+#             response["additional"] = {}
+#         if raw_output:
+#             response["raw_output"] = raw_output
+#         return response
+
+
+
+
+
+
+# ==================== services.py ====================
+import requests
+import json
+from typing import Dict, List, Optional
+from django.conf import settings
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from .ocr_service import OCRService
+
+
+class ProductAttributeService:
+    """Service class for extracting product attributes using Groq LLM."""
+
+    @staticmethod
+    def combine_product_text(
+        title: Optional[str] = None,
+        short_desc: Optional[str] = None,
+        long_desc: Optional[str] = None,
+        ocr_text: Optional[str] = None
+    ) -> str:
+        """Combine product metadata into a single text block."""
+        parts = []
+        if title:
+            parts.append(f"Title: {str(title).strip()}")
+        if short_desc:
+            parts.append(f"Description: {str(short_desc).strip()}")
+        if long_desc:
+            parts.append(f"Details: {str(long_desc).strip()}")
+        if ocr_text:
+            parts.append(f"OCR Text: {ocr_text}")
+        
+        combined = "\n".join(parts).strip()
+        
+        if not combined:
+            return "No product information available"
+        
+        return combined
+
+    @staticmethod
+    def extract_attributes_from_ocr(ocr_results: Dict, model: str = None) -> Dict:
+        """Extract structured attributes from OCR text using LLM."""
+        if model is None:
+            model = settings.SUPPORTED_MODELS[0]
+        
+        detected_text = ocr_results.get('detected_text', [])
+        if not detected_text:
+            return {}
+        
+        # Format OCR text for prompt
+        ocr_text = "\n".join([f"Text: {item['text']}, Confidence: {item['confidence']:.2f}" 
+                              for item in detected_text])
+        
+        prompt = f"""
+You are an AI model that extracts structured attributes from OCR text detected on product images.
+Given the OCR detections below, infer the possible product attributes and return them as a clean JSON object.
+
+OCR Text:
+{ocr_text}
+
+Extract relevant attributes like:
+- brand
+- model_number
+- size (waist_size, length, etc.)
+- collection
+- any other relevant product information
+
+Return a JSON object with only the attributes you can confidently identify.
+If an attribute is not present, do not include it in the response.
+"""
+        
+        payload = {
+            "model": model,
+            "messages": [
+                {
+                    "role": "system",
+                    "content": "You are a helpful AI that extracts structured data from OCR output. Return only valid JSON."
+                },
+                {"role": "user", "content": prompt}
+            ],
+            "temperature": 0.2,
+            "max_tokens": 500
+        }
+        
+        headers = {
+            "Authorization": f"Bearer {settings.GROQ_API_KEY}",
+            "Content-Type": "application/json",
+        }
+        
+        try:
+            response = requests.post(
+                settings.GROQ_API_URL,
+                headers=headers,
+                json=payload,
+                timeout=30
+            )
+            response.raise_for_status()
+            result_text = response.json()["choices"][0]["message"]["content"].strip()
+            
+            # Clean and parse JSON
+            result_text = ProductAttributeService._clean_json_response(result_text)
+            parsed = json.loads(result_text)
+            
+            return parsed
+        except Exception as e:
+            return {"error": f"Failed to extract attributes from OCR: {str(e)}"}
+
+    @staticmethod
+    def extract_attributes(
+        product_text: str,
+        mandatory_attrs: Dict[str, List[str]],
+        model: str = None,
+        extract_additional: bool = True
+    ) -> dict:
+        """Use Groq LLM to extract attributes from any product type."""
+        
+        if model is None:
+            model = settings.SUPPORTED_MODELS[0]
+
+        # Check if product text is empty or minimal
+        if not product_text or product_text == "No product information available":
+            return ProductAttributeService._create_error_response(
+                "No product information provided",
+                mandatory_attrs,
+                extract_additional
+            )
+
+        # Create structured prompt for mandatory attributes
+        mandatory_attr_list = []
+        for attr_name, allowed_values in mandatory_attrs.items():
+            mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
+        mandatory_attr_text = "\n".join(mandatory_attr_list)
+
+        additional_instruction = ""
+        if extract_additional:
+            additional_instruction = """
+2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text 
+   (such as Material, Size, Color, Brand, Dimensions, Weight, Features, Specifications, etc.) 
+   and their values. Extract attributes that are specific and relevant to this product type."""
+
+        output_format = {
+            "mandatory": {attr: "value" for attr in mandatory_attrs.keys()},
+            "additional": {} if extract_additional else None
+        }
+
+        if not extract_additional:
+            output_format.pop("additional")
+
+        prompt = f"""
+You are an intelligent product attribute extractor that works with ANY product type.
+
+TASK:
+1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value 
+   from the provided list. Choose the value that best matches the product description.
+{additional_instruction}
+
+Product Text:
+{product_text}
+
+Mandatory Attribute Lists (MUST select one value for each):
+{mandatory_attr_text}
+
+CRITICAL INSTRUCTIONS:
+- Return ONLY valid JSON, nothing else
+- No explanations, no markdown, no text before or after the JSON
+- For mandatory attributes, choose EXACTLY ONE value from the provided list that best matches
+- If a mandatory attribute cannot be determined from the product text, use "Not Specified"
+- Work with whatever information is available - the product text may be incomplete
+{f"- For additional attributes, extract any relevant information found in the product text" if extract_additional else ""}
+- Be precise and only extract information that is explicitly stated or clearly implied
+
+Required Output Format (ONLY THIS, NO OTHER TEXT):
+{json.dumps(output_format, indent=2)}
+        """
+
+        payload = {
+            "model": model,
+            "messages": [
+                {
+                    "role": "system",
+                    "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
+                },
+                {"role": "user", "content": prompt}
+            ],
+            "temperature": 0.0,
+            "max_tokens": 1500
+        }
+
+        headers = {
+            "Authorization": f"Bearer {settings.GROQ_API_KEY}",
+            "Content-Type": "application/json",
+        }
+
+        try:
+            response = requests.post(
+                settings.GROQ_API_URL,
+                headers=headers,
+                json=payload,
+                timeout=30
+            )
+            response.raise_for_status()
+            result_text = response.json()["choices"][0]["message"]["content"].strip()
+
+            # Clean the response
+            result_text = ProductAttributeService._clean_json_response(result_text)
+
+            # Parse JSON
+            parsed = json.loads(result_text)
+
+            # Validate and restructure if needed
+            parsed = ProductAttributeService._validate_response_structure(
+                parsed, mandatory_attrs, extract_additional
+            )
+
+            return parsed
+
+        except requests.exceptions.RequestException as e:
+            return ProductAttributeService._create_error_response(
+                str(e), mandatory_attrs, extract_additional
+            )
+        except json.JSONDecodeError as e:
+            return ProductAttributeService._create_error_response(
+                f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
+            )
+        except Exception as e:
+            return ProductAttributeService._create_error_response(
+                str(e), mandatory_attrs, extract_additional
+            )
+
+    @staticmethod
+    def extract_attributes_batch(
+        products: List[Dict],
+        mandatory_attrs: Dict[str, List[str]],
+        model: str = None,
+        extract_additional: bool = True,
+        process_image: bool = True,
+        max_workers: int = 5
+    ) -> Dict:
+        """Extract attributes for multiple products in parallel."""
+        results = []
+        successful = 0
+        failed = 0
+        
+        ocr_service = OCRService()
+
+        def process_product(product_data):
+            """Process a single product."""
+            product_id = product_data.get('product_id', f"product_{len(results)}")
+            
+            try:
+                # Process image if URL is provided
+                ocr_results = None
+                ocr_text = None
+                
+                if process_image and product_data.get('image_url'):
+                    ocr_results = ocr_service.process_image(product_data['image_url'])
+                    
+                    # Extract attributes from OCR
+                    if ocr_results and ocr_results.get('detected_text'):
+                        ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
+                            ocr_results, model
+                        )
+                        ocr_results['extracted_attributes'] = ocr_attrs
+                        
+                        # Format OCR text for combining with product text
+                        ocr_text = "\n".join([
+                            f"{item['text']} (confidence: {item['confidence']:.2f})"
+                            for item in ocr_results['detected_text']
+                        ])
+                
+                # Combine all product information
+                product_text = ProductAttributeService.combine_product_text(
+                    title=product_data.get('title'),
+                    short_desc=product_data.get('short_desc'),
+                    long_desc=product_data.get('long_desc'),
+                    ocr_text=ocr_text
+                )
+                
+                # Extract attributes from combined text
+                result = ProductAttributeService.extract_attributes(
+                    product_text=product_text,
+                    mandatory_attrs=mandatory_attrs,
+                    model=model,
+                    extract_additional=extract_additional
+                )
+                
+                result['product_id'] = product_id
+                
+                # Add OCR results if available
+                if ocr_results:
+                    result['ocr_results'] = ocr_results
+                
+                # Check if extraction was successful
+                if 'error' not in result:
+                    return result, True
+                else:
+                    return result, False
+                    
+            except Exception as e:
+                return {
+                    'product_id': product_id,
+                    'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
+                    'additional': {} if extract_additional else None,
+                    'error': f"Processing error: {str(e)}"
+                }, False
+
+        # Process products in parallel
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            future_to_product = {
+                executor.submit(process_product, product): product 
+                for product in products
+            }
+            
+            for future in as_completed(future_to_product):
+                try:
+                    result, success = future.result()
+                    results.append(result)
+                    if success:
+                        successful += 1
+                    else:
+                        failed += 1
+                except Exception as e:
+                    failed += 1
+                    results.append({
+                        'product_id': 'unknown',
+                        'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
+                        'additional': {} if extract_additional else None,
+                        'error': f"Unexpected error: {str(e)}"
+                    })
+
+        return {
+            'results': results,
+            'total_products': len(products),
+            'successful': successful,
+            'failed': failed
+        }
+
+    @staticmethod
+    def _clean_json_response(text: str) -> str:
+        """Clean LLM response to extract valid JSON."""
+        start_idx = text.find('{')
+        end_idx = text.rfind('}')
+
+        if start_idx != -1 and end_idx != -1:
+            text = text[start_idx:end_idx + 1]
+
+        if "```json" in text:
+            text = text.split("```json")[1].split("```")[0].strip()
+        elif "```" in text:
+            text = text.split("```")[1].split("```")[0].strip()
+            if text.startswith("json"):
+                text = text[4:].strip()
+
+        return text
+
+    @staticmethod
+    def _validate_response_structure(
+        parsed: dict,
+        mandatory_attrs: Dict[str, List[str]],
+        extract_additional: bool
+    ) -> dict:
+        """Validate and fix the response structure."""
+        expected_sections = ["mandatory"]
+        if extract_additional:
+            expected_sections.append("additional")
+
+        if not all(section in parsed for section in expected_sections):
+            if isinstance(parsed, dict):
+                mandatory_keys = set(mandatory_attrs.keys())
+                mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
+                additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
+
+                result = {"mandatory": mandatory}
+                if extract_additional:
+                    result["additional"] = additional
+                return result
+            else:
+                return ProductAttributeService._create_error_response(
+                    "Invalid response structure",
+                    mandatory_attrs,
+                    extract_additional,
+                    str(parsed)
+                )
+
+        return parsed
+
+    @staticmethod
+    def _create_error_response(
+        error: str,
+        mandatory_attrs: Dict[str, List[str]],
+        extract_additional: bool,
+        raw_output: Optional[str] = None
+    ) -> dict:
+        """Create a standardized error response."""
+        response = {
+            "mandatory": {attr: "Not Specified" for attr in mandatory_attrs.keys()},
+            "error": error
+        }
+        if extract_additional:
+            response["additional"] = {}
+        if raw_output:
+            response["raw_output"] = raw_output
+        return response
+

+ 3 - 0
attr_extraction/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 9 - 0
attr_extraction/urls.py

@@ -0,0 +1,9 @@
+# ==================== urls.py ====================
+from django.urls import path
+from .views import ExtractProductAttributesView, BatchExtractProductAttributesView, ProductListView
+
+urlpatterns = [
+    path('extract/', ExtractProductAttributesView.as_view(), name='extract-attributes'),
+    path('batch-extract/', BatchExtractProductAttributesView.as_view(), name='batch-extract-attributes'),
+    path('products/', ProductListView.as_view(), name='batch-extract-attributes'),
+]

+ 352 - 0
attr_extraction/views.py

@@ -0,0 +1,352 @@
+# #  #==================== views.py ====================
+# # from rest_framework.views import APIView
+# # from rest_framework.response import Response
+# # from rest_framework import status
+# # from .serializers import (
+# #     ProductAttributeRequestSerializer,
+# #     ProductAttributeResponseSerializer
+# # )
+# # from .services import ProductAttributeService
+
+
+# # class ExtractProductAttributesView(APIView):
+# #     """
+# #     API endpoint to extract product attributes using Groq LLM.
+    
+# #     POST /api/extract-attributes/
+    
+# #     Request Body:
+# #     {
+# #         "title": "Product title (optional)",
+# #         "short_desc": "Short description (optional)",
+# #         "long_desc": "Long description (optional)",
+# #         "mandatory_attrs": {
+# #             "Attribute1": ["value1", "value2", "value3"],
+# #             "Attribute2": ["valueA", "valueB"]
+# #         },
+# #         "model": "llama-3.1-8b-instant (optional)",
+# #         "extract_additional": true (optional, default: true)
+# #     }
+    
+# #     Response:
+# #     {
+# #         "mandatory": {
+# #             "Attribute1": "value1",
+# #             "Attribute2": "valueA"
+# #         },
+# #         "additional": {
+# #             "Color": "Blue",
+# #             "Brand": "Example"
+# #         }
+# #     }
+# #     """
+
+# #     def post(self, request):
+# #         # Validate request data
+# #         serializer = ProductAttributeRequestSerializer(data=request.data)
+# #         if not serializer.is_valid():
+# #             return Response(
+# #                 {"error": serializer.errors},
+# #                 status=status.HTTP_400_BAD_REQUEST
+# #             )
+
+# #         validated_data = serializer.validated_data
+
+# #         # Combine product text
+# #         product_text = ProductAttributeService.combine_product_text(
+# #             title=validated_data.get('title'),
+# #             short_desc=validated_data.get('short_desc'),
+# #             long_desc=validated_data.get('long_desc')
+# #         )
+
+# #         # Extract attributes
+# #         result = ProductAttributeService.extract_attributes(
+# #             product_text=product_text,
+# #             mandatory_attrs=validated_data['mandatory_attrs'],
+# #             model=validated_data.get('model'),
+# #             extract_additional=validated_data.get('extract_additional', True)
+# #         )
+
+# #         # Return response
+# #         response_serializer = ProductAttributeResponseSerializer(data=result)
+# #         if response_serializer.is_valid():
+# #             return Response(response_serializer.data, status=status.HTTP_200_OK)
+        
+# #         return Response(result, status=status.HTTP_200_OK)
+
+
+
+
+
+
+
+# from rest_framework.views import APIView
+# from rest_framework.response import Response
+# from rest_framework import status
+# from .serializers import (
+#     SingleProductRequestSerializer,
+#     BatchProductRequestSerializer,
+#     ProductAttributeResultSerializer,
+#     BatchProductResponseSerializer
+# )
+# from .services import ProductAttributeService
+
+
+# class ExtractProductAttributesView(APIView):
+#     """
+#     API endpoint to extract product attributes for a single product.
+    
+#     POST /api/extract-attributes/
+    
+#     Request Body:
+#     {
+#         "title": "Product title (optional)",
+#         "short_desc": "Short description (optional)",
+#         "long_desc": "Long description (optional)",
+#         "mandatory_attrs": {
+#             "Attribute1": ["value1", "value2", "value3"],
+#             "Attribute2": ["valueA", "valueB"]
+#         },
+#         "model": "llama-3.1-8b-instant (optional)",
+#         "extract_additional": true (optional, default: true)
+#     }
+#     """
+
+#     def post(self, request):
+#         serializer = SingleProductRequestSerializer(data=request.data)
+#         if not serializer.is_valid():
+#             return Response(
+#                 {"error": serializer.errors},
+#                 status=status.HTTP_400_BAD_REQUEST
+#             )
+
+#         validated_data = serializer.validated_data
+
+#         product_text = ProductAttributeService.combine_product_text(
+#             title=validated_data.get('title'),
+#             short_desc=validated_data.get('short_desc'),
+#             long_desc=validated_data.get('long_desc')
+#         )
+
+#         result = ProductAttributeService.extract_attributes(
+#             product_text=product_text,
+#             mandatory_attrs=validated_data['mandatory_attrs'],
+#             model=validated_data.get('model'),
+#             extract_additional=validated_data.get('extract_additional', True)
+#         )
+
+#         response_serializer = ProductAttributeResultSerializer(data=result)
+#         if response_serializer.is_valid():
+#             return Response(response_serializer.data, status=status.HTTP_200_OK)
+        
+#         return Response(result, status=status.HTTP_200_OK)
+
+
+# class BatchExtractProductAttributesView(APIView):
+#     """
+#     API endpoint to extract product attributes for multiple products in batch.
+    
+#     POST /api/batch-extract-attributes/
+    
+#     Request Body:
+#     {
+#         "products": [
+#             {
+#                 "product_id": "prod_001",
+#                 "title": "Product 1 title",
+#                 "short_desc": "Short description",
+#                 "long_desc": "Long description"
+#             },
+#             {
+#                 "product_id": "prod_002",
+#                 "title": "Product 2 title",
+#                 "short_desc": "Short description"
+#             }
+#         ],
+#         "mandatory_attrs": {
+#             "Attribute1": ["value1", "value2", "value3"],
+#             "Attribute2": ["valueA", "valueB"]
+#         },
+#         "model": "llama-3.1-8b-instant (optional)",
+#         "extract_additional": true (optional, default: true)
+#     }
+    
+#     Response:
+#     {
+#         "results": [
+#             {
+#                 "product_id": "prod_001",
+#                 "mandatory": {...},
+#                 "additional": {...}
+#             },
+#             {
+#                 "product_id": "prod_002",
+#                 "mandatory": {...},
+#                 "additional": {...}
+#             }
+#         ],
+#         "total_products": 2,
+#         "successful": 2,
+#         "failed": 0
+#     }
+#     """
+
+#     def post(self, request):
+#         serializer = BatchProductRequestSerializer(data=request.data)
+#         if not serializer.is_valid():
+#             return Response(
+#                 {"error": serializer.errors},
+#                 status=status.HTTP_400_BAD_REQUEST
+#             )
+
+#         validated_data = serializer.validated_data
+
+#         # Extract attributes for all products in batch
+#         result = ProductAttributeService.extract_attributes_batch(
+#             products=validated_data['products'],
+#             mandatory_attrs=validated_data['mandatory_attrs'],
+#             model=validated_data.get('model'),
+#             extract_additional=validated_data.get('extract_additional', True)
+#         )
+
+#         response_serializer = BatchProductResponseSerializer(data=result)
+#         if response_serializer.is_valid():
+#             return Response(response_serializer.data, status=status.HTTP_200_OK)
+        
+#         return Response(result, status=status.HTTP_200_OK)
+
+
+
+
+
+
+
+# ==================== views.py ====================
+from rest_framework.views import APIView
+from rest_framework.response import Response
+from rest_framework import status
+from .serializers import (
+    SingleProductRequestSerializer,
+    BatchProductRequestSerializer,
+    ProductAttributeResultSerializer,
+    BatchProductResponseSerializer
+)
+from .services import ProductAttributeService
+from .ocr_service import OCRService
+
+
+class ExtractProductAttributesView(APIView):
+    """
+    API endpoint to extract product attributes for a single product.
+    Now supports image URL for OCR-based text extraction.
+    """
+
+    def post(self, request):
+        serializer = SingleProductRequestSerializer(data=request.data)
+        if not serializer.is_valid():
+            return Response(
+                {"error": serializer.errors},
+                status=status.HTTP_400_BAD_REQUEST
+            )
+
+        validated_data = serializer.validated_data
+        
+        # Process image if URL provided
+        ocr_results = None
+        ocr_text = None
+        
+        if validated_data.get('process_image', True) and validated_data.get('image_url'):
+            ocr_service = OCRService()
+            ocr_results = ocr_service.process_image(validated_data['image_url'])
+            
+            # Extract attributes from OCR
+            if ocr_results and ocr_results.get('detected_text'):
+                ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
+                    ocr_results,
+                    validated_data.get('model')
+                )
+                ocr_results['extracted_attributes'] = ocr_attrs
+                
+                # Format OCR text
+                ocr_text = "\n".join([
+                    f"{item['text']} (confidence: {item['confidence']:.2f})"
+                    for item in ocr_results['detected_text']
+                ])
+
+        # Combine all product information
+        product_text = ProductAttributeService.combine_product_text(
+            title=validated_data.get('title'),
+            short_desc=validated_data.get('short_desc'),
+            long_desc=validated_data.get('long_desc'),
+            ocr_text=ocr_text
+        )
+
+        # Extract attributes
+        result = ProductAttributeService.extract_attributes(
+            product_text=product_text,
+            mandatory_attrs=validated_data['mandatory_attrs'],
+            model=validated_data.get('model'),
+            extract_additional=validated_data.get('extract_additional', True)
+        )
+        
+        # Add OCR results if available
+        if ocr_results:
+            result['ocr_results'] = ocr_results
+
+        response_serializer = ProductAttributeResultSerializer(data=result)
+        if response_serializer.is_valid():
+            return Response(response_serializer.data, status=status.HTTP_200_OK)
+        
+        return Response(result, status=status.HTTP_200_OK)
+
+
+class BatchExtractProductAttributesView(APIView):
+    """
+    API endpoint to extract product attributes for multiple products in batch.
+    Now supports image URLs for OCR-based text extraction.
+    """
+
+    def post(self, request):
+        serializer = BatchProductRequestSerializer(data=request.data)
+        if not serializer.is_valid():
+            return Response(
+                {"error": serializer.errors},
+                status=status.HTTP_400_BAD_REQUEST
+            )
+
+        validated_data = serializer.validated_data
+
+        # Extract attributes for all products in batch
+        result = ProductAttributeService.extract_attributes_batch(
+            products=validated_data['products'],
+            mandatory_attrs=validated_data['mandatory_attrs'],
+            model=validated_data.get('model'),
+            extract_additional=validated_data.get('extract_additional', True),
+            process_image=validated_data.get('process_image', True)
+        )
+
+        response_serializer = BatchProductResponseSerializer(data=result)
+        if response_serializer.is_valid():
+            return Response(response_serializer.data, status=status.HTTP_200_OK)
+        
+        return Response(result, status=status.HTTP_200_OK)
+
+
+
+
+
+from rest_framework.views import APIView
+from rest_framework.response import Response
+from rest_framework import status
+from .models import Product
+from .serializers import ProductSerializer
+
+class ProductListView(APIView):
+    """
+    GET API to list all products with details
+    """
+    def get(self, request):
+        products = Product.objects.all()
+        serializer = ProductSerializer(products, many=True)
+        return Response(serializer.data, status=status.HTTP_200_OK)
+

BIN
content_quality_tool/__pycache__/settings.cpython-313.pyc


BIN
content_quality_tool/__pycache__/urls.cpython-313.pyc


+ 5 - 0
content_quality_tool/settings.py

@@ -29,6 +29,7 @@ INSTALLED_APPS = [
     'django.contrib.staticfiles',
     'core',
     'rest_framework',
+    'attr_extraction',
 ]
 MIDDLEWARE = [
     'django.middleware.security.SecurityMiddleware',
@@ -118,3 +119,7 @@ MESSAGE_TAGS = {
 
 
 
+GROQ_API_KEY = "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"
+GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
+SUPPORTED_MODELS = ["llama-3.1-8b-instant", "llama-3.3-70b-versatile", "mixtral-8x7b-32768"]
+MAX_BATCH_SIZE = 100  # Maximum products per batch request

+ 1 - 0
content_quality_tool/urls.py

@@ -29,6 +29,7 @@ urlpatterns = [
 
     # api url
     path("core/", include("core.urls")),
+    path("attr/", include("attr_extraction.urls")),
     # path("", views.login_view, name="login_view"),
 ]
 

BIN
core/__pycache__/models.cpython-313.pyc


BIN
core/services/__pycache__/attribute_scorer.cpython-313.pyc