
optimized code

Harshit Pathak 3 months ago
parent
commit
780e17ac26

Changes are not shown because the file size is too large.
+ 884 - 1631
attr_extraction/services.py


+ 375 - 989
attr_extraction/views.py

@@ -132,11 +132,14 @@ class ExtractProductAttributesView(APIView):
         return Response(result, status=status.HTTP_200_OK)
 
 
+
+# Replace the BatchExtractProductAttributesView in your views.py with this updated version
+
 # class BatchExtractProductAttributesView(APIView):
 #     """
 #     API endpoint to extract product attributes for multiple products in batch.
 #     Uses item-specific mandatory_attrs with source tracking.
-#     Returns attributes in array format: [{"value": "...", "source": "..."}]
+#     Returns attributes in array format with original_value field.
 #     Includes OCR and Visual Processing results.
 #     """
 
@@ -147,15 +150,6 @@ class ExtractProductAttributesView(APIView):
 
 #         validated_data = serializer.validated_data
         
-#         # DEBUG: Print what we received
-#         print("\n" + "="*80)
-#         print("BATCH REQUEST - RECEIVED DATA")
-#         print("="*80)
-#         print(f"Raw request data keys: {request.data.keys()}")
-#         print(f"Multiple field in request: {request.data.get('multiple')}")
-#         print(f"Validated multiple field: {validated_data.get('multiple')}")
-#         print("="*80 + "\n")
-        
 #         # Get batch-level settings
 #         product_list = validated_data.get("products", [])
 #         model = validated_data.get("model")
@@ -168,27 +162,33 @@ class ExtractProductAttributesView(APIView):
 #         use_adaptive_margin = validated_data.get("use_adaptive_margin", True)
 #         use_semantic_clustering = validated_data.get("use_semantic_clustering", True)
         
-#         # DEBUG: Print extracted settings
-#         print(f"Extracted multiple parameter: {multiple}")
-#         print(f"Type: {type(multiple)}")
-        
 #         # Extract all item_ids to query the database efficiently
 #         item_ids = [p['item_id'] for p in product_list] 
         
 #         # Fetch all products in one query
 #         products_queryset = Product.objects.filter(item_id__in=item_ids)
-        
-#         # Create a dictionary for easy lookup: item_id -> Product object
 #         product_map = {product.item_id: product for product in products_queryset}
 #         found_ids = set(product_map.keys())
         
+#         # Fetch all original attribute values for these products in one query
+#         original_values_qs = ProductAttributeValue.objects.filter(
+#             product__item_id__in=item_ids
+#         ).select_related('product')
+        
+#         # Create a nested dictionary: {item_id: {attribute_name: original_value}}
+#         original_values_map = {}
+#         for attr_val in original_values_qs:
+#             item_id = attr_val.product.item_id
+#             if item_id not in original_values_map:
+#                 original_values_map[item_id] = {}
+#             original_values_map[item_id][attr_val.attribute_name] = attr_val.original_value
+        
 #         results = []
 #         successful = 0
 #         failed = 0
 
 #         for product_entry in product_list:
 #             item_id = product_entry['item_id']
-#             # Get item-specific mandatory attributes
 #             mandatory_attrs = product_entry['mandatory_attrs'] 
 
 #             if item_id not in found_ids:
@@ -206,7 +206,7 @@ class ExtractProductAttributesView(APIView):
 #                 short_desc = product.product_short_description
 #                 long_desc = product.product_long_description
 #                 image_url = product.image_path
-#                 # image_url = "https://images.unsplash.com/photo-1595777457583-95e059d581b8"
+                
 #                 ocr_results = None
 #                 ocr_text = None
 #                 visual_results = None
@@ -216,7 +216,6 @@ class ExtractProductAttributesView(APIView):
 #                     # OCR Processing
 #                     ocr_service = OCRService()
 #                     ocr_results = ocr_service.process_image(image_url)
-#                     print(f"OCR results for {item_id}: {ocr_results}")
                     
 #                     if ocr_results and ocr_results.get("detected_text"):
 #                         ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
@@ -232,7 +231,6 @@ class ExtractProductAttributesView(APIView):
 #                     visual_service = VisualProcessingService()
 #                     product_type_hint = product.product_type if hasattr(product, 'product_type') else None
 #                     visual_results = visual_service.process_image(image_url, product_type_hint)
-#                     print(f"Visual results for {item_id}: {visual_results.get('visual_attributes', {})}")
                     
 #                     # Format visual attributes to array format with source tracking
 #                     if visual_results and visual_results.get('visual_attributes'):
@@ -248,10 +246,6 @@ class ExtractProductAttributesView(APIView):
 #                     ocr_text=ocr_text
 #                 )
 
-#                 # DEBUG: Print before extraction
-#                 print(f"\n>>> Extracting for product {item_id}")
-#                 print(f"    Passing multiple: {multiple}")
-
 #                 # Attribute Extraction with source tracking (returns array format)
 #                 extracted = ProductAttributeService.extract_attributes(
 #                     product_text=product_text,
@@ -267,6 +261,25 @@ class ExtractProductAttributesView(APIView):
 #                     use_semantic_clustering=use_semantic_clustering
 #                 )
 
+#                 # Add original_value to each extracted attribute
+#                 original_attrs = original_values_map.get(item_id, {})
+                
+#                 # Process mandatory attributes
+#                 for attr_name, attr_values in extracted.get("mandatory", {}).items():
+#                     if isinstance(attr_values, list):
+#                         for attr_obj in attr_values:
+#                             if isinstance(attr_obj, dict):
+#                                 # Add original_value if it exists
+#                                 attr_obj["original_value"] = original_attrs.get(attr_name, "")
+                
+#                 # Process additional attributes
+#                 for attr_name, attr_values in extracted.get("additional", {}).items():
+#                     if isinstance(attr_values, list):
+#                         for attr_obj in attr_values:
+#                             if isinstance(attr_obj, dict):
+#                                 # Add original_value if it exists
+#                                 attr_obj["original_value"] = original_attrs.get(attr_name, "")
+
 #                 result = {
 #                     "product_id": product.item_id,
 #                     "mandatory": extracted.get("mandatory", {}),
@@ -305,51 +318,244 @@ class ExtractProductAttributesView(APIView):
 #         return Response(batch_result, status=status.HTTP_200_OK)
 
 
+# views.py - OPTIMIZED WITHOUT REDIS/CELERY
 
+# class BatchExtractProductAttributesView(APIView):
+#     """
+#     Optimized batch extraction using ThreadPoolExecutor (built-in Python)
+#     """
 
-# Replace the BatchExtractProductAttributesView in your views.py with this updated version
+#     def post(self, request):
+#         serializer = BatchProductRequestSerializer(data=request.data)
+#         if not serializer.is_valid():
+#             return Response({"error": serializer.errors}, status=status.HTTP_400_BAD_REQUEST)
+
+#         validated_data = serializer.validated_data
+#         product_list = validated_data.get("products", [])
+        
+#         # OPTIMIZATION 1: Single optimized database query
+#         item_ids = [p['item_id'] for p in product_list]
+#         products_queryset = Product.objects.filter(
+#             item_id__in=item_ids
+#         ).prefetch_related('attribute_values')  # Single query!
+        
+#         product_map = {product.item_id: product for product in products_queryset}
+        
+#         # OPTIMIZATION 2: Prefetch ALL original attribute values in ONE query
+#         original_values_qs = ProductAttributeValue.objects.filter(
+#             product__item_id__in=item_ids
+#         ).select_related('product')
+        
+#         original_values_map = {}
+#         for attr_val in original_values_qs:
+#             item_id = attr_val.product.item_id
+#             if item_id not in original_values_map:
+#                 original_values_map[item_id] = {}
+#             original_values_map[item_id][attr_val.attribute_name] = attr_val.original_value
+        
+#         # Extract settings
+#         model = validated_data.get("model")
+#         extract_additional = validated_data.get("extract_additional", True)
+#         process_image = validated_data.get("process_image", True)
+#         multiple = validated_data.get("multiple", [])
+#         threshold_abs = validated_data.get("threshold_abs", 0.65)
+#         margin = validated_data.get("margin", 0.15)
+#         use_dynamic_thresholds = validated_data.get("use_dynamic_thresholds", True)
+#         use_adaptive_margin = validated_data.get("use_adaptive_margin", True)
+#         use_semantic_clustering = validated_data.get("use_semantic_clustering", True)
+        
+#         results = []
+#         successful = 0
+#         failed = 0
+        
+#         # OPTIMIZATION 3: Initialize services once
+#         ocr_service = OCRService() if process_image else None
+#         visual_service = VisualProcessingService() if process_image else None
+
+#         # OPTIMIZATION 4: Process in parallel using ThreadPoolExecutor
+#         def process_single_product(product_entry):
+#             """Process a single product (runs in parallel)"""
+#             item_id = product_entry['item_id']
+#             mandatory_attrs = product_entry['mandatory_attrs']
+
+#             if item_id not in product_map:
+#                 return {
+#                     "product_id": item_id,
+#                     "error": "Product not found in database"
+#                 }, False
+
+#             product = product_map[item_id]
+            
+#             try:
+#                 title = product.product_name
+#                 short_desc = product.product_short_description
+#                 long_desc = product.product_long_description
+#                 image_url = product.image_path
+                
+#                 ocr_results = None
+#                 ocr_text = None
+#                 visual_results = None
+
+#                 # Image processing (if enabled)
+#                 if process_image and image_url:
+#                     if ocr_service:
+#                         ocr_results = ocr_service.process_image(image_url)
+                        
+#                         if ocr_results and ocr_results.get("detected_text"):
+#                             ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
+#                                 ocr_results, model
+#                             )
+#                             ocr_results["extracted_attributes"] = ocr_attrs
+#                             ocr_text = "\n".join([
+#                                 f"{item['text']} (confidence: {item['confidence']:.2f})"
+#                                 for item in ocr_results["detected_text"]
+#                             ])
+                    
+#                     if visual_service:
+#                         product_type_hint = product.product_type if hasattr(product, 'product_type') else None
+#                         visual_results = visual_service.process_image(image_url, product_type_hint)
+                        
+#                         if visual_results and visual_results.get('visual_attributes'):
+#                             visual_results['visual_attributes'] = ProductAttributeService.format_visual_attributes(
+#                                 visual_results['visual_attributes']
+#                             )
+
+#                 # Combine product text with source tracking
+#                 product_text, source_map = ProductAttributeService.combine_product_text(
+#                     title=title,
+#                     short_desc=short_desc,
+#                     long_desc=long_desc,
+#                     ocr_text=ocr_text
+#                 )
+
+#                 # Extract attributes (WITH CACHING ENABLED)
+#                 extracted = ProductAttributeService.extract_attributes(
+#                     product_text=product_text,
+#                     mandatory_attrs=mandatory_attrs,
+#                     source_map=source_map,
+#                     model=model,
+#                     extract_additional=extract_additional,
+#                     multiple=multiple,
+#                     threshold_abs=threshold_abs,
+#                     margin=margin,
+#                     use_dynamic_thresholds=use_dynamic_thresholds,
+#                     use_adaptive_margin=use_adaptive_margin,
+#                     use_semantic_clustering=use_semantic_clustering,
+#                     use_cache=True  # Enable caching!
+#                 )
+
+#                 # Add original values
+#                 original_attrs = original_values_map.get(item_id, {})
+                
+#                 for attr_name, attr_values in extracted.get("mandatory", {}).items():
+#                     if isinstance(attr_values, list):
+#                         for attr_obj in attr_values:
+#                             if isinstance(attr_obj, dict):
+#                                 attr_obj["original_value"] = original_attrs.get(attr_name, "")
+                
+#                 for attr_name, attr_values in extracted.get("additional", {}).items():
+#                     if isinstance(attr_values, list):
+#                         for attr_obj in attr_values:
+#                             if isinstance(attr_obj, dict):
+#                                 attr_obj["original_value"] = original_attrs.get(attr_name, "")
+
+#                 result = {
+#                     "product_id": product.item_id,
+#                     "mandatory": extracted.get("mandatory", {}),
+#                     "additional": extracted.get("additional", {}),
+#                 }
+
+#                 if ocr_results:
+#                     result["ocr_results"] = ocr_results
+                
+#                 if visual_results:
+#                     result["visual_results"] = visual_results
+
+#                 return result, True
+
+#             except Exception as e:
+#                 return {
+#                     "product_id": item_id,
+#                     "error": str(e)
+#                 }, False
+
+#         # OPTIMIZATION 5: Use ThreadPoolExecutor for parallel processing
+#         import concurrent.futures
+#         max_workers = min(10, len(product_list))  # Up to 10 parallel workers
+        
+#         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+#             # Submit all tasks
+#             future_to_product = {
+#                 executor.submit(process_single_product, product): product
+#                 for product in product_list
+#             }
+            
+#             # Collect results as they complete
+#             for future in concurrent.futures.as_completed(future_to_product):
+#                 try:
+#                     result, success = future.result()
+#                     results.append(result)
+#                     if success:
+#                         successful += 1
+#                     else:
+#                         failed += 1
+#                 except Exception as e:
+#                     failed += 1
+#                     logger.error(f"Unexpected error: {str(e)}")
+#                     results.append({
+#                         "product_id": "unknown",
+#                         "error": str(e)
+#                     })
+
+#         batch_result = {
+#             "results": results,
+#             "total_products": len(product_list),
+#             "successful": successful,
+#             "failed": failed
+#         }
+
+#         response_serializer = BatchProductResponseSerializer(data=batch_result)
+#         if response_serializer.is_valid():
+#             return Response(response_serializer.data, status=status.HTTP_200_OK)
+
+#         return Response(batch_result, status=status.HTTP_200_OK)
+
+
+# ==================== OPTIMIZED BATCH VIEW ====================
+import concurrent.futures
 
 class BatchExtractProductAttributesView(APIView):
     """
-    API endpoint to extract product attributes for multiple products in batch.
-    Uses item-specific mandatory_attrs with source tracking.
-    Returns attributes in array format with original_value field.
-    Includes OCR and Visual Processing results.
+    ⚡ PERFORMANCE OPTIMIZED: Batch extraction with intelligent parallelization
+    Expected performance: 10 products in 30-60 seconds (with image processing)
     """
 
     def post(self, request):
+        import time
+        start_time = time.time()
+        
         serializer = BatchProductRequestSerializer(data=request.data)
         if not serializer.is_valid():
             return Response({"error": serializer.errors}, status=status.HTTP_400_BAD_REQUEST)
 
         validated_data = serializer.validated_data
-        
-        # Get batch-level settings
         product_list = validated_data.get("products", [])
-        model = validated_data.get("model")
-        extract_additional = validated_data.get("extract_additional", True)
-        process_image = validated_data.get("process_image", True)
-        multiple = validated_data.get("multiple", [])
-        threshold_abs = validated_data.get("threshold_abs", 0.65)
-        margin = validated_data.get("margin", 0.15)
-        use_dynamic_thresholds = validated_data.get("use_dynamic_thresholds", True)
-        use_adaptive_margin = validated_data.get("use_adaptive_margin", True)
-        use_semantic_clustering = validated_data.get("use_semantic_clustering", True)
         
-        # Extract all item_ids to query the database efficiently
-        item_ids = [p['item_id'] for p in product_list] 
+        logger.info(f"🚀 Starting batch processing for {len(product_list)} products")
+        
+        # ==================== OPTIMIZATION 1: Bulk DB Query ====================
+        item_ids = [p['item_id'] for p in product_list]
+        products_queryset = Product.objects.filter(
+            item_id__in=item_ids
+        ).prefetch_related('attribute_values')
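+        # NOTE: assumes Product exposes a reverse accessor named 'attribute_values'
+        # (related_name on ProductAttributeValue); the explicit query below does not rely on it.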
         
-        # Fetch all products in one query
-        products_queryset = Product.objects.filter(item_id__in=item_ids)
         product_map = {product.item_id: product for product in products_queryset}
-        found_ids = set(product_map.keys())
         
-        # Fetch all original attribute values for these products in one query
+        # Prefetch ALL original attribute values in ONE query
         original_values_qs = ProductAttributeValue.objects.filter(
             product__item_id__in=item_ids
         ).select_related('product')
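        # select_related('product') avoids one extra query per row when reading
        # attr_val.product.item_id in the loop below.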
         
-        # Create a nested dictionary: {item_id: {attribute_name: original_value}}
         original_values_map = {}
         for attr_val in original_values_qs:
             item_id = attr_val.product.item_id
@@ -357,25 +563,53 @@ class BatchExtractProductAttributesView(APIView):
                 original_values_map[item_id] = {}
             original_values_map[item_id][attr_val.attribute_name] = attr_val.original_value
         
+        logger.info(f"✓ Loaded {len(product_map)} products from database")
+        
+        # Extract settings
+        model = validated_data.get("model")
+        extract_additional = validated_data.get("extract_additional", True)
+        process_image = validated_data.get("process_image", True)
+        multiple = validated_data.get("multiple", [])
+        threshold_abs = validated_data.get("threshold_abs", 0.65)
+        margin = validated_data.get("margin", 0.15)
+        use_dynamic_thresholds = validated_data.get("use_dynamic_thresholds", True)
+        use_adaptive_margin = validated_data.get("use_adaptive_margin", True)
+        use_semantic_clustering = validated_data.get("use_semantic_clustering", True)
+        
         results = []
         successful = 0
         failed = 0
+        
+        # ==================== OPTIMIZATION 2: Conditional Service Init ====================
+        # Only initialize if processing images
+        ocr_service = None
+        visual_service = None
+        
+        if process_image:
+            from .ocr_service import OCRService
+            from .visual_processing_service import VisualProcessingService
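+            # Lazy imports: OCR/visual dependencies are only loaded when process_image is True.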
+            ocr_service = OCRService()
+            visual_service = VisualProcessingService()
+            logger.info("✓ Image processing services initialized")
 
-        for product_entry in product_list:
+        # ==================== OPTIMIZATION 3: Smart Parallelization ====================
+        def process_single_product(product_entry):
+            """Process a single product (runs in parallel)"""
+            import time
+            product_start = time.time()
+            
             item_id = product_entry['item_id']
-            mandatory_attrs = product_entry['mandatory_attrs'] 
+            mandatory_attrs = product_entry['mandatory_attrs']
 
-            if item_id not in found_ids:
-                failed += 1
-                results.append({
+            if item_id not in product_map:
+                return {
                     "product_id": item_id,
                     "error": "Product not found in database"
-                })
-                continue
+                }, False
 
             product = product_map[item_id]
             
-            try: 
+            try:
                 title = product.product_name
                 short_desc = product.product_short_description
                 long_desc = product.product_long_description
@@ -385,32 +619,29 @@ class BatchExtractProductAttributesView(APIView):
                 ocr_text = None
                 visual_results = None
 
-                # Image Processing Logic
+                # ⚡ SKIP IMAGE PROCESSING IF DISABLED (HUGE TIME SAVER)
                 if process_image and image_url:
-                    # OCR Processing
-                    ocr_service = OCRService()
-                    ocr_results = ocr_service.process_image(image_url)
-                    
-                    if ocr_results and ocr_results.get("detected_text"):
-                        ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
-                            ocr_results, model
-                        )
-                        ocr_results["extracted_attributes"] = ocr_attrs
-                        ocr_text = "\n".join([
-                            f"{item['text']} (confidence: {item['confidence']:.2f})"
-                            for item in ocr_results["detected_text"]
-                        ])
+                    if ocr_service:
+                        ocr_results = ocr_service.process_image(image_url)
+                        
+                        if ocr_results and ocr_results.get("detected_text"):
+                            ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
+                                ocr_results, model
+                            )
+                            ocr_results["extracted_attributes"] = ocr_attrs
+                            ocr_text = "\n".join([
+                                f"{item['text']} (confidence: {item['confidence']:.2f})"
+                                for item in ocr_results["detected_text"]
+                            ])
                     
-                    # Visual Processing
-                    visual_service = VisualProcessingService()
-                    product_type_hint = product.product_type if hasattr(product, 'product_type') else None
-                    visual_results = visual_service.process_image(image_url, product_type_hint)
-                    
-                    # Format visual attributes to array format with source tracking
-                    if visual_results and visual_results.get('visual_attributes'):
-                        visual_results['visual_attributes'] = ProductAttributeService.format_visual_attributes(
-                            visual_results['visual_attributes']
-                        )
+                    if visual_service:
+                        product_type_hint = product.product_type if hasattr(product, 'product_type') else None
+                        visual_results = visual_service.process_image(image_url, product_type_hint)
+                        
+                        if visual_results and visual_results.get('visual_attributes'):
+                            visual_results['visual_attributes'] = ProductAttributeService.format_visual_attributes(
+                                visual_results['visual_attributes']
+                            )
 
                 # Combine product text with source tracking
                 product_text, source_map = ProductAttributeService.combine_product_text(
@@ -420,7 +651,7 @@ class BatchExtractProductAttributesView(APIView):
                     ocr_text=ocr_text
                 )
 
-                # Attribute Extraction with source tracking (returns array format)
+                # ⚡ EXTRACT ATTRIBUTES WITH CACHING ENABLED
                 extracted = ProductAttributeService.extract_attributes(
                     product_text=product_text,
                     mandatory_attrs=mandatory_attrs,
@@ -432,26 +663,23 @@ class BatchExtractProductAttributesView(APIView):
                     margin=margin,
                     use_dynamic_thresholds=use_dynamic_thresholds,
                     use_adaptive_margin=use_adaptive_margin,
-                    use_semantic_clustering=use_semantic_clustering
+                    use_semantic_clustering=use_semantic_clustering,
+                    use_cache=True  # ⚡ CRITICAL: Enable caching
                 )
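                # use_cache=True assumes ProductAttributeService.extract_attributes accepts this
                # keyword (presumably added in the services.py changes in this same commit).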
 
-                # Add original_value to each extracted attribute
+                # Add original values
                 original_attrs = original_values_map.get(item_id, {})
                 
-                # Process mandatory attributes
                 for attr_name, attr_values in extracted.get("mandatory", {}).items():
                     if isinstance(attr_values, list):
                         for attr_obj in attr_values:
                             if isinstance(attr_obj, dict):
-                                # Add original_value if it exists
                                 attr_obj["original_value"] = original_attrs.get(attr_name, "")
                 
-                # Process additional attributes
                 for attr_name, attr_values in extracted.get("additional", {}).items():
                     if isinstance(attr_values, list):
                         for attr_obj in attr_values:
                             if isinstance(attr_obj, dict):
-                                # Add original_value if it exists
                                 attr_obj["original_value"] = original_attrs.get(attr_name, "")
 
                 result = {
@@ -460,29 +688,82 @@ class BatchExtractProductAttributesView(APIView):
                     "additional": extracted.get("additional", {}),
                 }
 
-                # Attach OCR results if available
                 if ocr_results:
                     result["ocr_results"] = ocr_results
                 
-                # Attach Visual Processing results if available
                 if visual_results:
                     result["visual_results"] = visual_results
+                
+                processing_time = time.time() - product_start
+                logger.info(f"✓ Processed {item_id} in {processing_time:.2f}s")
 
-                results.append(result)
-                successful += 1
+                return result, True
 
             except Exception as e:
-                failed += 1
-                results.append({
+                logger.error(f"❌ Error processing {item_id}: {str(e)}")
+                return {
                     "product_id": item_id,
                     "error": str(e)
-                })
+                }, False
+
+        # ==================== OPTIMIZATION 4: Parallel Execution ====================
+        # Adjust workers based on whether image processing is enabled
+        max_workers = min(3 if process_image else 10, len(product_list))
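+        # Heavier OCR/visual work per product -> fewer workers; ThreadPoolExecutor requires
+        # max_workers >= 1, so this assumes the batch contains at least one product.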
+        
+        logger.info(f"⚡ Using {max_workers} parallel workers")
+        
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all tasks
+            future_to_product = {
+                executor.submit(process_single_product, product): product
+                for product in product_list
+            }
+            
+            # Collect results as they complete
+            for future in concurrent.futures.as_completed(future_to_product):
+                try:
+                    result, success = future.result()
+                    results.append(result)
+                    if success:
+                        successful += 1
+                    else:
+                        failed += 1
+                except Exception as e:
+                    failed += 1
+                    logger.error(f"❌ Future execution error: {str(e)}")
+                    results.append({
+                        "product_id": "unknown",
+                        "error": str(e)
+                    })
+
+        total_time = time.time() - start_time
+        
+        # Get cache statistics
+        cache_stats = ProductAttributeService.get_cache_stats()
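+        # Assumes get_cache_stats() returns a dict shaped like
+        # {"embedding_cache": {"hit_rate_percent": ...}}; adjust the keys below if it differs.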
+        
+        logger.info(f"""
+🎉 BATCH PROCESSING COMPLETE
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+  Total products: {len(product_list)}
+  Successful: {successful}
+  Failed: {failed}
+  Total time: {total_time:.2f}s
+  Avg time/product: {total_time/len(product_list):.2f}s
+  Cache hit rate: {cache_stats['embedding_cache']['hit_rate_percent']:.1f}%
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+        """)
 
         batch_result = {
             "results": results,
             "total_products": len(product_list),
             "successful": successful,
-            "failed": failed
+            "failed": failed,
+            "performance": {
+                "total_time_seconds": round(total_time, 2),
+                "avg_time_per_product": round(total_time / len(product_list), 2),
+                "workers_used": max_workers
+            },
+            "cache_stats": cache_stats
         }
 
         response_serializer = BatchProductResponseSerializer(data=batch_result)
@@ -493,7 +774,6 @@ class BatchExtractProductAttributesView(APIView):
 
 
 
-
 class ProductListView(APIView):
     """
     GET API to list all products with details
@@ -512,77 +792,6 @@ import pandas as pd
 from .models import Product
 
 
-# class ProductUploadExcelView(APIView):
-#     """
-#     POST API to upload an Excel file and add/update data in Product model.
-#     - Creates new records if they don't exist.
-#     - Updates existing ones (e.g., when image_path or other fields change).
-#     """
-#     parser_classes = (MultiPartParser, FormParser)
-
-#     def post(self, request, *args, **kwargs):
-#         file_obj = request.FILES.get('file')
-#         if not file_obj:
-#             return Response({'error': 'No file provided'}, status=status.HTTP_400_BAD_REQUEST)
-
-#         try:
-#             # Read Excel into DataFrame
-#             df = pd.read_excel(file_obj)
-#             df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
-
-#             expected_cols = {
-#                 'item_id',
-#                 'product_name',
-#                 'product_long_description',
-#                 'product_short_description',
-#                 'product_type',
-#                 'image_path'
-#             }
-
-#             # Check required columns
-#             if not expected_cols.issubset(df.columns):
-#                 return Response({
-#                     'error': 'Missing required columns',
-#                     'required_columns': list(expected_cols)
-#                 }, status=status.HTTP_400_BAD_REQUEST)
-
-#             created_count = 0
-#             updated_count = 0
-
-#             # Loop through rows and update or create
-#             for _, row in df.iterrows():
-#                 item_id = str(row.get('item_id', '')).strip()
-#                 if not item_id:
-#                     continue  # Skip rows without an item_id
-
-#                 defaults = {
-#                     'product_name': row.get('product_name', ''),
-#                     'product_long_description': row.get('product_long_description', ''),
-#                     'product_short_description': row.get('product_short_description', ''),
-#                     'product_type': row.get('product_type', ''),
-#                     'image_path': row.get('image_path', ''),
-#                 }
-
-#                 obj, created = Product.objects.update_or_create(
-#                     item_id=item_id,
-#                     defaults=defaults
-#                 )
-
-#                 if created:
-#                     created_count += 1
-#                 else:
-#                     updated_count += 1
-
-#             return Response({
-#                 'message': f'Upload successful.',
-#                 'created': f'{created_count} new records added.',
-#                 'updated': f'{updated_count} existing records updated.'
-#             }, status=status.HTTP_201_CREATED)
-
-#         except Exception as e:
-#             return Response({'error': str(e)}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
-# Replace the ProductUploadExcelView in your views.py with this updated version
-
 from rest_framework.views import APIView
 from rest_framework.response import Response
 from rest_framework import status
@@ -592,835 +801,12 @@ import pandas as pd
 from .models import Product, ProductAttributeValue
 
 
-# class ProductUploadExcelView(APIView):
-#     """
-#     POST API to upload an Excel file with two sheets:
-#     1. 'Products' sheet - Product details
-#     2. 'Attribute_values' sheet - Original attribute values
-    
-#     Creates/updates both products and their attribute values in a single transaction.
-#     """
-#     parser_classes = (MultiPartParser, FormParser)
 
-#     def post(self, request, *args, **kwargs):
-#         file_obj = request.FILES.get('file')
-#         if not file_obj:
-#             return Response({'error': 'No file provided'}, status=status.HTTP_400_BAD_REQUEST)
 
-#         try:
-#             # Read all sheets from Excel file
-#             excel_file = pd.ExcelFile(file_obj)
-            
-#             # Check if required sheets exist
-#             if 'Products' not in excel_file.sheet_names:
-#                 return Response({
-#                     'error': "Missing 'Products' sheet",
-#                     'available_sheets': excel_file.sheet_names
-#                 }, status=status.HTTP_400_BAD_REQUEST)
-            
-#             # Read Products sheet
-#             df_products = pd.read_excel(excel_file, sheet_name='Products')
-#             df_products.columns = [c.strip().lower().replace(' ', '_') for c in df_products.columns]
-
-#             # Check required columns for Products
-#             expected_product_cols = {
-#                 'item_id',
-#                 'product_name',
-#                 'product_long_description',
-#                 'product_short_description',
-#                 'product_type',
-#                 'image_path'
-#             }
 
-#             if not expected_product_cols.issubset(df_products.columns):
-#                 return Response({
-#                     'error': 'Missing required columns in Products sheet',
-#                     'required_columns': list(expected_product_cols),
-#                     'found_columns': list(df_products.columns)
-#                 }, status=status.HTTP_400_BAD_REQUEST)
 
-#             # Read Attribute_values sheet if it exists
-#             df_attributes = None
-#             has_attributes_sheet = 'Attribute_values' in excel_file.sheet_names
-            
-#             if has_attributes_sheet:
-#                 df_attributes = pd.read_excel(excel_file, sheet_name='Attribute_values')
-#                 df_attributes.columns = [c.strip().lower().replace(' ', '_') for c in df_attributes.columns]
-                
-#                 # Check required columns for Attribute_values
-#                 expected_attr_cols = {'item_id', 'attribute_name', 'original_value'}
-#                 if not expected_attr_cols.issubset(df_attributes.columns):
-#                     return Response({
-#                         'error': 'Missing required columns in Attribute_values sheet',
-#                         'required_columns': list(expected_attr_cols),
-#                         'found_columns': list(df_attributes.columns)
-#                     }, status=status.HTTP_400_BAD_REQUEST)
-
-#             # Initialize counters
-#             products_created = 0
-#             products_updated = 0
-#             attributes_created = 0
-#             attributes_updated = 0
-#             products_failed = 0
-#             attributes_failed = 0
-#             errors = []
-
-#             # Use transaction to ensure atomicity
-#             with transaction.atomic():
-#                 # Process Products sheet
-#                 for idx, row in df_products.iterrows():
-#                     item_id = str(row.get('item_id', '')).strip()
-#                     if not item_id:
-#                         products_failed += 1
-#                         errors.append(f"Products Row {idx + 2}: Missing item_id")
-#                         continue
-
-#                     try:
-#                         defaults = {
-#                             'product_name': str(row.get('product_name', '')),
-#                             'product_long_description': str(row.get('product_long_description', '')),
-#                             'product_short_description': str(row.get('product_short_description', '')),
-#                             'product_type': str(row.get('product_type', '')),
-#                             'image_path': str(row.get('image_path', '')),
-#                         }
-
-#                         obj, created = Product.objects.update_or_create(
-#                             item_id=item_id,
-#                             defaults=defaults
-#                         )
 
-#                         if created:
-#                             products_created += 1
-#                         else:
-#                             products_updated += 1
-#                     except Exception as e:
-#                         products_failed += 1
-#                         errors.append(f"Products Row {idx + 2} (item_id: {item_id}): {str(e)}")
-
-#                 # Process Attribute_values sheet if it exists
-#                 if has_attributes_sheet and df_attributes is not None:
-#                     # Group by item_id to optimize lookups
-#                     item_ids_in_attrs = df_attributes['item_id'].unique()
-                    
-#                     # Fetch all products at once
-#                     existing_products = {
-#                         p.item_id: p 
-#                         for p in Product.objects.filter(item_id__in=item_ids_in_attrs)
-#                     }
-
-#                     for idx, row in df_attributes.iterrows():
-#                         item_id = str(row.get('item_id', '')).strip()
-#                         attribute_name = str(row.get('attribute_name', '')).strip()
-#                         original_value = str(row.get('original_value', '')).strip()
-
-#                         if not item_id or not attribute_name:
-#                             attributes_failed += 1
-#                             errors.append(
-#                                 f"Attribute_values Row {idx + 2}: Missing item_id or attribute_name"
-#                             )
-#                             continue
-
-#                         # Check if product exists
-#                         product = existing_products.get(item_id)
-#                         if not product:
-#                             attributes_failed += 1
-#                             errors.append(
-#                                 f"Attribute_values Row {idx + 2}: Product with item_id '{item_id}' not found. "
-#                                 "Make sure it exists in Products sheet."
-#                             )
-#                             continue
-
-#                         try:
-#                             attr_obj, created = ProductAttributeValue.objects.update_or_create(
-#                                 product=product,
-#                                 attribute_name=attribute_name,
-#                                 defaults={'original_value': original_value}
-#                             )
-
-#                             if created:
-#                                 attributes_created += 1
-#                             else:
-#                                 attributes_updated += 1
-#                         except Exception as e:
-#                             attributes_failed += 1
-#                             errors.append(
-#                                 f"Attribute_values Row {idx + 2} "
-#                                 f"(item_id: {item_id}, attribute: {attribute_name}): {str(e)}"
-#                             )
-
-#             # Prepare response
-#             response_data = {
-#                 'message': 'Upload completed successfully',
-#                 'products': {
-#                     'created': products_created,
-#                     'updated': products_updated,
-#                     'failed': products_failed,
-#                     'total_processed': products_created + products_updated + products_failed
-#                 }
-#             }
-
-#             if has_attributes_sheet:
-#                 response_data['attribute_values'] = {
-#                     'created': attributes_created,
-#                     'updated': attributes_updated,
-#                     'failed': attributes_failed,
-#                     'total_processed': attributes_created + attributes_updated + attributes_failed
-#                 }
-#             else:
-#                 response_data['attribute_values'] = {
-#                     'message': 'Attribute_values sheet not found in Excel file'
-#                 }
 
-#             if errors:
-#                 response_data['errors'] = errors[:50]  # Limit to first 50 errors
-#                 if len(errors) > 50:
-#                     response_data['errors'].append(f"... and {len(errors) - 50} more errors")
-
-#             # Determine status code
-#             if products_failed > 0 or attributes_failed > 0:
-#                 status_code = status.HTTP_207_MULTI_STATUS
-#             else:
-#                 status_code = status.HTTP_201_CREATED
-
-#             return Response(response_data, status=status_code)
-
-#         except pd.errors.EmptyDataError:
-#             return Response({
-#                 'error': 'The uploaded Excel file is empty or invalid'
-#             }, status=status.HTTP_400_BAD_REQUEST)
-#         except Exception as e:
-#             return Response({
-#                 'error': f'An error occurred while processing the file: {str(e)}'
-#             }, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
-
-
-
-
-
-
-
-# import logging
-# import json
-# from rest_framework.views import APIView
-# from rest_framework.response import Response
-# from rest_framework import status
-# from rest_framework.parsers import MultiPartParser, FormParser
-# from django.db import transaction
-# from django.db.models import Prefetch, F
-# import pandas as pd
-# # Import ALL your models
-# from .models import Product, ProductAttributeValue, ProductType, ProductAttribute, AttributePossibleValue
-# from .services import ProductAttributeService
-# from .ocr_service import OCRService
-# from .visual_processing_service import VisualProcessingService
-# from openpyxl import Workbook
-# from openpyxl.styles import Font, PatternFill, Alignment
-# from django.conf import settings
-# import os
-# import threading
-# from datetime import datetime
-
-# # --- Logging Setup ---
-# # Define log and status file paths in MEDIA_ROOT
-# LOG_FILE_PATH = os.path.join(settings.MEDIA_ROOT, 'excel_generation.log')
-# STATUS_FILE_PATH = os.path.join(settings.MEDIA_ROOT, 'excel_generation_status.json')
-
-# # Ensure the MEDIA_ROOT exists for files to be saved
-# if not os.path.exists(settings.MEDIA_ROOT):
-#     os.makedirs(settings.MEDIA_ROOT)
-
-# # Configure basic logging
-# logging.basicConfig(
-#     filename=LOG_FILE_PATH,
-#     level=logging.INFO,
-#     format='%(asctime)s - %(levelname)s - %(message)s'
-# )
-# logger = logging.getLogger(__name__)
-
-# # -------------------------------------------------------------------------------------------------
-
-# def generate_product_excel_background():
-#     """
-#     Function to perform batch attribute extraction for all products and generate an Excel file.
-#     Runs in a background thread to avoid blocking the API response.
-#     Logs success/failure and saves a status file for external monitoring.
-#     """
-#     logger.info(f"[{datetime.now().isoformat()}] Starting background product Excel generation and attribute extraction.")
-    
-#     successful = 0
-#     failed = 0
-#     results = [] # To store detailed extraction results for Excel sheet 2
-    
-#     # Function to write status file (SUCCESS/FAILED)
-#     def write_status(status_type, error_msg=None):
-#         status_data = {
-#             "status": status_type,
-#             "timestamp": datetime.now().isoformat(),
-#             "products_processed": successful + failed,
-#             "products_successful": successful,
-#             "products_failed": failed,
-#             "excel_path": os.path.join(settings.MEDIA_URL, 'generated_products.xlsx') if status_type == "SUCCESS" else None,
-#             "log_path": os.path.join(settings.MEDIA_URL, 'excel_generation.log'),
-#             "error_message": error_msg
-#         }
-#         try:
-#             with open(STATUS_FILE_PATH, 'w') as f:
-#                 json.dump(status_data, f, indent=4)
-#         except Exception as e:
-#             logger.exception(f"CRITICAL ERROR: Failed to write status file at {STATUS_FILE_PATH}: {e}")
-
-#     try:
-#         # 1. PREFETCH all necessary related data to minimize database queries
-        
-#         # Prefetch possible values for mandatory attributes
-#         possible_values_prefetch = Prefetch(
-#             'attributes',
-#             queryset=ProductAttribute.objects.filter(is_mandatory=True).prefetch_related('possible_values')
-#         )
-        
-#         # Fetch all ProductTypes with their mandatory attributes and possible values
-#         all_product_types = ProductType.objects.prefetch_related(possible_values_prefetch)
-#         product_type_map = {
-#             pt.name: pt for pt in all_product_types
-#         }
-
-#         # Prepare product_list for batch extraction
-#         all_products = Product.objects.all()
-#         product_list = []
-        
-#         for p in all_products:
-#             # mandatory_attrs will be the dictionary required by the service
-#             mandatory_attrs_dict = {}
-#             product_type_name = p.product_type.strip() if p.product_type else None
-            
-#             if product_type_name and product_type_name in product_type_map:
-#                 pt = product_type_map[product_type_name]
-                
-#                 # Build the mandatory_attrs dictionary: { "Attribute Name": ["Value 1", "Value 2"], ... }
-#                 for attr in pt.attributes.all(): # .all() here works because we used Prefetch for 'attributes'
-#                     # attr.possible_values.all() works because we used prefetch_related('possible_values')
-#                     mandatory_attrs_dict[attr.name] = [
-#                         pv.value for pv in attr.possible_values.all()
-#                     ]
-            
-#             product_list.append({
-#                 "item_id": p.item_id,
-#                 "product_type_name": product_type_name,
-#                 "mandatory_attrs": mandatory_attrs_dict # <-- FIX: Pass the dictionary here
-#             })
-
-#         # Batch settings (using defaults)
-#         model = "llama-3.1-8b-instant"
-#         extract_additional = True
-#         process_image = False
-#         multiple = []
-#         threshold_abs = 0.65
-#         margin = 0.15
-#         use_dynamic_thresholds = True
-#         use_adaptive_margin = True
-#         use_semantic_clustering = True
-
-#         # Batch extraction logic
-#         item_ids = [p['item_id'] for p in product_list]
-#         products_queryset = Product.objects.filter(item_id__in=item_ids)
-#         product_map = {product.item_id: product for product in products_queryset}
-#         found_ids = set(product_map.keys())
-
-#         for product_entry in product_list:
-#             item_id = product_entry['item_id']
-#             # FIX: mandatory_attrs is now correctly a dictionary (or an empty dictionary)
-#             mandatory_attrs = product_entry['mandatory_attrs'] 
-
-#             if item_id not in found_ids:
-#                 failed += 1
-#                 results.append({
-#                     "product_id": item_id,
-#                     "error": "Product not found in database"
-#                 })
-#                 logger.warning(f"Product {item_id} not found in database. Skipping extraction.")
-#                 continue
-
-#             product = product_map[item_id]
-
-#             try:
-#                 title = product.product_name
-#                 short_desc = product.product_short_description
-#                 long_desc = product.product_long_description
-#                 image_url = product.image_path
-
-#                 ocr_results = None
-#                 ocr_text = None
-#                 visual_results = None
-
-#                 if process_image and image_url:
-#                     logger.info(f"Processing image for product {item_id}...")
-#                     # OCR Processing
-#                     ocr_service = OCRService()
-#                     ocr_results = ocr_service.process_image(image_url)
-
-#                     if ocr_results and ocr_results.get("detected_text"):
-#                         # NOTE: Assuming ProductAttributeService.extract_attributes_from_ocr exists
-#                         ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
-#                              ocr_results, model
-#                         )
-#                         ocr_results["extracted_attributes"] = ocr_attrs
-#                         ocr_text = "\n".join([
-#                              f"{item['text']} (confidence: {item['confidence']:.2f})"
-#                              for item in ocr_results["detected_text"]
-#                         ])
-
-#                     # Visual Processing
-#                     visual_service = VisualProcessingService()
-#                     product_type_hint = product.product_type if product.product_type else None
-#                     visual_results = visual_service.process_image(image_url, product_type_hint)
-
-#                     if visual_results and visual_results.get('visual_attributes'):
-#                         visual_results['visual_attributes'] = ProductAttributeService.format_visual_attributes(
-#                             visual_results['visual_attributes']
-#                         )
-#                     logger.info(f"Image processing done for product {item_id}.")
-
-
-#                 # Combine product text with source tracking
-#                 product_text, source_map = ProductAttributeService.combine_product_text(
-#                     title=title,
-#                     short_desc=short_desc,
-#                     long_desc=long_desc,
-#                     ocr_text=ocr_text
-#                 )
-
-#                 # Attribute Extraction with source tracking
-#                 extracted = ProductAttributeService.extract_attributes(
-#                     product_text=product_text,
-#                     mandatory_attrs=mandatory_attrs, # <-- This is now the dictionary with possible values
-#                     source_map=source_map,
-#                     model=model,
-#                     extract_additional=extract_additional,
-#                     multiple=multiple,
-#                     threshold_abs=threshold_abs,
-#                     margin=margin,
-#                     use_dynamic_thresholds=use_dynamic_thresholds,
-#                     use_adaptive_margin=use_adaptive_margin,
-#                     use_semantic_clustering=use_semantic_clustering
-#                 )
-
-#                 result = {
-#                     "product_id": item_id,
-#                     "mandatory": extracted.get("mandatory", {}),
-#                     "additional": extracted.get("additional", {}),
-#                 }
-
-#                 if ocr_results:
-#                     result["ocr_results"] = ocr_results
-
-#                 if visual_results:
-#                     result["visual_results"] = visual_results
-
-#                 results.append(result)
-#                 successful += 1
-#                 logger.info(f"Attribute extraction successful for product {item_id}.")
-
-#             except Exception as e:
-#                 failed += 1
-#                 results.append({
-#                     "product_id": item_id,
-#                     "error": str(e)
-#                 })
-#                 # Original Error: AttributeError: 'list' object has no attribute 'items'
-#                 # This should now be fixed, but we keep the robust exception handling.
-#                 logger.exception(f"Error during attribute extraction for product {item_id}.")
-
-#         logger.info(f"Batch extraction phase complete. Successful: {successful}, Failed: {failed}")
-        
-#         # --------------------------------------------------------------------------------
-#         # Generate and save the Excel file (Unchanged)
-#         # --------------------------------------------------------------------------------
-#         wb = Workbook()
-
-#         # Sheet 1: Products (from DB)
-#         ws_products = wb.active
-#         ws_products.title = "Products"
-#         products_headers = ['ITEM ID', 'PRODUCT NAME', 'PRODUCT TYPE', 'Product Short Description', 'Product Long Description', 'image_path']
-#         header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
-#         header_font = Font(bold=True, color="FFFFFF")
-
-#         for col_num, header in enumerate(products_headers, 1):
-#             cell = ws_products.cell(row=1, column=col_num)
-#             cell.value = header
-#             cell.fill = header_fill
-#             cell.font = header_font
-#             cell.alignment = Alignment(horizontal="center", vertical="center")
-
-#         all_products_db = Product.objects.all()
-#         for row_num, product in enumerate(all_products_db, 2):
-#             ws_products.cell(row=row_num, column=1, value=product.item_id)
-#             ws_products.cell(row=row_num, column=2, value=product.product_name)
-#             ws_products.cell(row=row_num, column=3, value=product.product_type)
-#             ws_products.cell(row=row_num, column=4, value=product.product_short_description)
-#             ws_products.cell(row=row_num, column=5, value=product.product_long_description)
-#             ws_products.cell(row=row_num, column=6, value=product.image_path)
-
-#         # Adjust column widths
-#         for col_dim, width in zip(['A', 'B', 'C', 'D', 'E', 'F'], [15, 25, 15, 35, 50, 45]):
-#              ws_products.column_dimensions[col_dim].width = width
-
-#         # Sheet 2: Attribute_values (item_id, attribute_name, original_value, generated_value)
-#         ws_attributes = wb.create_sheet("Attribute_values")
-#         attributes_headers = ['item_id', 'attribute_name', 'original_value', 'generated_value']
-#         for col_num, header in enumerate(attributes_headers, 1):
-#             cell = ws_attributes.cell(row=1, column=col_num)
-#             cell.value = header
-#             cell.fill = header_fill
-#             cell.font = header_font
-#             cell.alignment = Alignment(horizontal="center", vertical="center")
-
-#         # Fetch all original attributes
-#         row_num = 2
-#         all_original_attrs = ProductAttributeValue.objects.all()
-#         # Create a lookup for original attributes by item_id and attribute_name
-#         original_attrs_lookup = {
-#             (attr.product.item_id, attr.attribute_name): attr.original_value
-#             for attr in all_original_attrs
-#         }
-
-#         # Add attributes (original and generated)
-#         processed_original_keys = set()
-#         for res in results:
-#             item_id = res["product_id"]
-
-#             if "error" in res:
-#                 # Add existing original attributes for failed products to the sheet
-#                 for (orig_item_id, orig_attr_name), orig_value in original_attrs_lookup.items():
-#                     if orig_item_id == item_id:
-#                         ws_attributes.cell(row=row_num, column=1, value=orig_item_id)
-#                         ws_attributes.cell(row=row_num, column=2, value=orig_attr_name)
-#                         ws_attributes.cell(row=row_num, column=3, value=orig_value)
-#                         ws_attributes.cell(row=row_num, column=4, value=f"Extraction Failed: {res['error']}")
-#                         processed_original_keys.add((orig_item_id, orig_attr_name))
-#                         row_num += 1
-#                 continue
-
-#             # Combine all generated attributes (mandatory, additional, OCR, visual)
-#             generated_attrs = {}
-#             for cat in ["mandatory", "additional"]:
-#                 attrs = res.get(cat, {})
-#                 for attr_name, values in attrs.items():
-#                     for val in values:
-#                         key = (item_id, attr_name)
-#                         if key not in generated_attrs:
-#                             generated_attrs[key] = []
-#                         generated_attrs[key].append(f"{val['value']} (source: {val['source']})")
-
-#             # OCR extracted
-#             ocr = res.get("ocr_results")
-#             if ocr and "extracted_attributes" in ocr and isinstance(ocr["extracted_attributes"], dict):
-#                 for attr_name, values in ocr["extracted_attributes"].items():
-#                     for val in values:
-#                         key = (item_id, attr_name)
-#                         if key not in generated_attrs:
-#                             generated_attrs[key] = []
-#                         generated_attrs[key].append(f"{val['value']} (source: {val['source']})")
-
-#             # Visual extracted
-#             visual = res.get("visual_results")
-#             if visual and "visual_attributes" in visual:
-#                 vis_attrs = visual["visual_attributes"]
-#                 if isinstance(vis_attrs, dict):
-#                     # Handle dict format where value might be a list of dicts or a single value
-#                     for attr_name, values in vis_attrs.items():
-#                         if not isinstance(values, list):
-#                             values = [{"value": values, "source": "visual"}]
-#                         for val in values:
-#                             key = (item_id, attr_name)
-#                             if key not in generated_attrs:
-#                                 generated_attrs[key] = []
-#                             generated_attrs[key].append(f"{val['value']} (source: {val.get('source', 'visual')})")
-#                 elif isinstance(vis_attrs, list):
-#                     # Handle list of dicts format
-#                     for item in vis_attrs:
-#                         attr_name = item.get("attribute_name") or item.get("name")
-#                         if not attr_name: continue
-#                         value = item.get("value", "")
-#                         source = item.get("source", "visual")
-#                         key = (item_id, attr_name)
-#                         if key not in generated_attrs:
-#                             generated_attrs[key] = []
-#                         generated_attrs[key].append(f"{value} (source: {source})")
-
-
-#             # Write attributes to Excel
-#             for (attr_item_id, attr_name), gen_values in generated_attrs.items():
-#                 # Get original value from lookup (if it exists)
-#                 original_value = original_attrs_lookup.get((attr_item_id, attr_name), "")
-#                 # Combine multiple generated values into a single string
-#                 generated_value = "; ".join(gen_values) if gen_values else ""
-#                 # Write row
-#                 ws_attributes.cell(row=row_num, column=1, value=attr_item_id)
-#                 ws_attributes.cell(row=row_num, column=2, value=attr_name)
-#                 ws_attributes.cell(row=row_num, column=3, value=original_value)
-#                 ws_attributes.cell(row=row_num, column=4, value=generated_value)
-#                 processed_original_keys.add((attr_item_id, attr_name))
-#                 row_num += 1
-
-#             # Add original attributes that have no generated values for this item_id
-#             for (orig_item_id, orig_attr_name), orig_value in original_attrs_lookup.items():
-#                 if orig_item_id == item_id and (orig_item_id, orig_attr_name) not in processed_original_keys:
-#                     ws_attributes.cell(row=row_num, column=1, value=orig_item_id)
-#                     ws_attributes.cell(row=row_num, column=2, value=orig_attr_name)
-#                     ws_attributes.cell(row=row_num, column=3, value=orig_value)
-#                     ws_attributes.cell(row=row_num, column=4, value="") # No generated value
-#                     processed_original_keys.add((orig_item_id, orig_attr_name))
-#                     row_num += 1
-        
-#         # Add original attributes for products not included in the 'results' (e.g. if they didn't exist in product_list)
-#         # We assume all products are in product_list, so this step might be redundant, but safe for completeness.
-#         for (orig_item_id, orig_attr_name), orig_value in original_attrs_lookup.items():
-#             if (orig_item_id, orig_attr_name) not in processed_original_keys:
-#                 ws_attributes.cell(row=row_num, column=1, value=orig_item_id)
-#                 ws_attributes.cell(row=row_num, column=2, value=orig_attr_name)
-#                 ws_attributes.cell(row=row_num, column=3, value=orig_value)
-#                 ws_attributes.cell(row=row_num, column=4, value="Original value only (Product not processed in batch)")
-#                 row_num += 1
-
-
-#         # Adjust column widths for attributes
-#         for col_dim, width in zip(['A', 'B', 'C', 'D'], [15, 35, 50, 50]):
-#              ws_attributes.column_dimensions[col_dim].width = width
-
-#         # Save the generated Excel (replace existing)
-#         save_path = os.path.join(settings.MEDIA_ROOT, 'generated_products.xlsx')
-#         wb.save(save_path)
-#         logger.info(f"Excel file successfully saved to {save_path}")
-        
-#         # Write SUCCESS status
-#         write_status("SUCCESS")
-#         logger.info("Background task finished successfully.")
-
-
-#     except Exception as e:
-#         # Log the critical error and write FAILED status
-#         logger.exception("CRITICAL ERROR during background Excel generation process.")
-#         write_status("FAILED", error_msg=str(e))
-
-
-# # -------------------------------------------------------------------------------------------------
-
-# class ProductUploadExcelView(APIView):
-#     """
-#     POST API to upload an Excel file. (Unchanged)
-#     """
-#     parser_classes = (MultiPartParser, FormParser)
-
-#     def post(self, request, *args, **kwargs):
-#         file_obj = request.FILES.get('file')
-#         if not file_obj:
-#             return Response({'error': 'No file provided'}, status=status.HTTP_400_BAD_REQUEST)
-
-#         try:
-#             # Read all sheets from Excel file
-#             excel_file = pd.ExcelFile(file_obj)
-            
-#             # Check if required sheets exist
-#             if 'Products' not in excel_file.sheet_names:
-#                  logger.error(f"Upload failed: Missing 'Products' sheet in file.")
-#                  return Response({
-#                      'error': "Missing 'Products' sheet",
-#                      'available_sheets': excel_file.sheet_names
-#                  }, status=status.HTTP_400_BAD_REQUEST)
-            
-#             # Read Products sheet
-#             df_products = pd.read_excel(excel_file, sheet_name='Products')
-#             df_products.columns = [c.strip().lower().replace(' ', '_') for c in df_products.columns]
-
-#             # Check required columns for Products
-#             expected_product_cols = {
-#                  'item_id', 'product_name', 'product_long_description',
-#                  'product_short_description', 'product_type', 'image_path'
-#             }
-
-#             if not expected_product_cols.issubset(df_products.columns):
-#                  logger.error(f"Upload failed: Missing required columns in Products sheet.")
-#                  return Response({
-#                      'error': 'Missing required columns in Products sheet',
-#                      'required_columns': list(expected_product_cols),
-#                      'found_columns': list(df_products.columns)
-#                  }, status=status.HTTP_400_BAD_REQUEST)
-
-#             # Read Attribute_values sheet if it exists
-#             df_attributes = None
-#             has_attributes_sheet = 'Attribute_values' in excel_file.sheet_names
-            
-#             if has_attributes_sheet:
-#                  df_attributes = pd.read_excel(excel_file, sheet_name='Attribute_values')
-#                  df_attributes.columns = [c.strip().lower().replace(' ', '_') for c in df_attributes.columns]
-                 
-#                  # Check required columns for Attribute_values
-#                  expected_attr_cols = {'item_id', 'attribute_name', 'original_value'}
-#                  if not expected_attr_cols.issubset(df_attributes.columns):
-#                      logger.error(f"Upload failed: Missing required columns in Attribute_values sheet.")
-#                      return Response({
-#                           'error': 'Missing required columns in Attribute_values sheet',
-#                           'required_columns': list(expected_attr_cols),
-#                           'found_columns': list(df_attributes.columns)
-#                      }, status=status.HTTP_400_BAD_REQUEST)
-
-#             # Initialize counters
-#             products_created = 0
-#             products_updated = 0
-#             attributes_created = 0
-#             attributes_updated = 0
-#             products_failed = 0
-#             attributes_failed = 0
-#             errors = []
-
-#             # Use transaction to ensure atomicity
-#             with transaction.atomic():
-#                  # Process Products sheet
-#                  for idx, row in df_products.iterrows():
-#                      item_id = str(row.get('item_id', '')).strip()
-#                      product_type = str(row.get('product_type', '')).strip()
-
-#                      if not item_id:
-#                          products_failed += 1
-#                          errors.append(f"Products Row {idx + 2}: Missing item_id")
-#                          continue
-
-#                      try:
-#                          # Auto-create ProductType if provided and doesn't exist
-#                          if product_type:
-#                              ProductType.objects.get_or_create(name=product_type)
-
-#                          defaults = {
-#                              'product_name': str(row.get('product_name', '')),
-#                              'product_long_description': str(row.get('product_long_description', '')),
-#                              'product_short_description': str(row.get('product_short_description', '')),
-#                              'product_type': product_type,
-#                              'image_path': str(row.get('image_path', '')),
-#                          }
-
-#                          obj, created = Product.objects.update_or_create(
-#                              item_id=item_id,
-#                              defaults=defaults
-#                          )
-
-#                          if created:
-#                              products_created += 1
-#                          else:
-#                              products_updated += 1
-#                      except Exception as e:
-#                          products_failed += 1
-#                          errors.append(f"Products Row {idx + 2} (item_id: {item_id}): {str(e)}")
-#                          logger.error(f"Error processing product {item_id} in Products sheet: {e}")
-
-
-#                  # Process Attribute_values sheet if it exists
-#                  if has_attributes_sheet and df_attributes is not None:
-#                       # Group by item_id to optimize lookups
-#                       item_ids_in_attrs = df_attributes['item_id'].astype(str).unique()
-                      
-#                       # Fetch all products at once
-#                       existing_products = {
-#                           p.item_id: p 
-#                           for p in Product.objects.filter(item_id__in=item_ids_in_attrs)
-#                       }
-
-#                       for idx, row in df_attributes.iterrows():
-#                           item_id = str(row.get('item_id', '')).strip()
-#                           attribute_name = str(row.get('attribute_name', '')).strip()
-#                           original_value = str(row.get('original_value', '')).strip()
-
-#                           if not item_id or not attribute_name:
-#                               attributes_failed += 1
-#                               errors.append(
-#                                   f"Attribute_values Row {idx + 2}: Missing item_id or attribute_name"
-#                               )
-#                               continue
-
-#                           # Check if product exists
-#                           product = existing_products.get(item_id)
-#                           if not product:
-#                               attributes_failed += 1
-#                               errors.append(
-#                                   f"Attribute_values Row {idx + 2}: Product with item_id '{item_id}' not found. "
-#                                   "Make sure it exists in Products sheet."
-#                               )
-#                               continue
-
-#                           try:
-#                               attr_obj, created = ProductAttributeValue.objects.update_or_create(
-#                                   product=product,
-#                                   attribute_name=attribute_name,
-#                                   defaults={'original_value': original_value}
-#                               )
-
-#                               if created:
-#                                   attributes_created += 1
-#                               else:
-#                                   attributes_updated += 1
-#                           except Exception as e:
-#                               attributes_failed += 1
-#                               errors.append(
-#                                   f"Attribute_values Row {idx + 2} "
-#                                   f"(item_id: {item_id}, attribute: {attribute_name}): {str(e)}"
-#                               )
-#                               logger.error(f"Error processing attribute {attribute_name} for product {item_id}: {e}")
-
-#             # Prepare response data
-#             response_data = {
-#                 'message': 'Upload completed',
-#                 'products': {
-#                     'created': products_created,
-#                     'updated': products_updated,
-#                     'failed': products_failed,
-#                     'total_processed': products_created + products_updated + products_failed
-#                 },
-#                 'attribute_values': {
-#                      'created': attributes_created,
-#                      'updated': attributes_updated,
-#                      'failed': attributes_failed,
-#                      'total_processed': attributes_created + attributes_updated + attributes_failed
-#                 } if has_attributes_sheet else {'message': 'Attribute_values sheet not found in Excel file'},
-#                 'generated_excel_status': 'Excel generation started in the background.'
-#             }
-
-#             if errors:
-#                 response_data['errors'] = errors[:50]
-#                 if len(errors) > 50:
-#                     response_data['errors'].append(f"... and {len(errors) - 50} more errors")
-
-#             # Determine status code for upload
-#             upload_status = status.HTTP_201_CREATED if products_failed == 0 and attributes_failed == 0 else status.HTTP_207_MULTI_STATUS
-
-#             # Start background thread for Excel generation if upload was successful
-#             if products_failed == 0 and attributes_failed == 0:
-#                 logger.info("API call successful. Triggering background Excel generation thread.")
-#                 threading.Thread(target=generate_product_excel_background, daemon=True).start()
-                
-#                 # Update response to provide monitoring paths
-#                 response_data['generated_excel_status'] = 'Background Excel generation triggered successfully.'
-#                 response_data['monitoring'] = {
-#                      'excel_file': os.path.join(settings.MEDIA_URL, 'generated_products.xlsx'),
-#                      'status_file': os.path.join(settings.MEDIA_URL, 'excel_generation_status.json'),
-#                      'log_file': os.path.join(settings.MEDIA_URL, 'excel_generation.log'),
-#                      'note': 'These files will be available once the background process completes.'
-#                 }
-#             else:
-#                  logger.warning(f"API call finished with errors ({products_failed} products, {attributes_failed} attributes). Not triggering background excel generation.")
-#                  response_data['generated_excel_status'] = 'Background Excel generation was NOT triggered due to upload errors. Fix upload errors and re-upload.'
-
-
-#             return Response(response_data, status=upload_status)
-
-#         except pd.errors.EmptyDataError:
-#             logger.error('The uploaded Excel file is empty or invalid.')
-#             return Response({
-#                 'error': 'The uploaded Excel file is empty or invalid'
-#             }, status=status.HTTP_400_BAD_REQUEST)
-#         except Exception as e:
-#             logger.exception(f'An unexpected error occurred while processing the file.')
-#             return Response({
-#                 'error': f'An unexpected error occurred while processing the file: {str(e)}'
-#             }, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
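Note on the views.py removal above: the deleted block flattened `visual_results["visual_attributes"]` while tolerating both a dict keyed by attribute name and a list of dicts, which is what originally triggered the `AttributeError: 'list' object has no attribute 'items'` mentioned in its comments. A minimal sketch of that normalization pattern, kept separate from the diff (the helper name and field fallbacks are illustrative, derived only from the removed code):

```python
# Sketch only: flatten visual_attributes that may arrive either as a dict
# keyed by attribute name or as a list of dicts, mirroring the removed block.
from typing import Dict, List, Tuple, Union


def iter_visual_attrs(vis_attrs: Union[Dict, List, None]) -> List[Tuple[str, str, str]]:
    """Return (attribute_name, value, source) triples from either shape."""
    triples: List[Tuple[str, str, str]] = []
    if isinstance(vis_attrs, dict):
        for attr_name, values in vis_attrs.items():
            # Wrap scalar values so both shapes look the same downstream
            if not isinstance(values, list):
                values = [{"value": values, "source": "visual"}]
            for val in values:
                triples.append(
                    (attr_name, str(val.get("value", "")), val.get("source", "visual"))
                )
    elif isinstance(vis_attrs, list):
        for item in vis_attrs:
            attr_name = item.get("attribute_name") or item.get("name")
            if not attr_name:
                continue
            triples.append(
                (attr_name, str(item.get("value", "")), item.get("source", "visual"))
            )
    return triples
```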
 
 
 

+ 94 - 29
attr_extraction/visual_processing_service.py

@@ -380,6 +380,11 @@ from sklearn.cluster import KMeans
 
 logger = logging.getLogger(__name__)
 
+import os
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # Disable tokenizer warnings
+import warnings
+warnings.filterwarnings('ignore')  # Suppress all warnings
+
 
 class VisualProcessingService:
     """Service for extracting visual attributes from product images using CLIP with smart subcategory detection."""
@@ -585,6 +590,57 @@ class VisualProcessingService:
         else:
             return 'gray'
     
+    # def classify_with_clip(
+    #     self,
+    #     image: Image.Image,
+    #     candidates: List[str],
+    #     attribute_name: str,
+    #     confidence_threshold: float = 0.15
+    # ) -> Dict:
+    #     """Use CLIP to classify image against candidate labels."""
+    #     try:
+    #         model, processor = self._get_clip_model()
+    #         device = self._get_device()
+            
+    #         # Prepare inputs
+    #         inputs = processor(
+    #             text=candidates,
+    #             images=image,
+    #             return_tensors="pt",
+    #             padding=True
+    #         )
+            
+    #         # Move to device
+    #         inputs = {k: v.to(device) for k, v in inputs.items()}
+            
+    #         # Get predictions
+    #         with torch.no_grad():
+    #             outputs = model(**inputs)
+    #             logits_per_image = outputs.logits_per_image
+    #             probs = logits_per_image.softmax(dim=1).cpu()
+            
+    #         # Get top predictions
+    #         top_k = min(3, len(candidates))
+    #         top_probs, top_indices = torch.topk(probs[0], k=top_k)
+            
+    #         results = []
+    #         for prob, idx in zip(top_probs, top_indices):
+    #             if prob.item() > confidence_threshold:
+    #                 results.append({
+    #                     "value": candidates[idx.item()],
+    #                     "confidence": round(float(prob.item()), 3)
+    #                 })
+            
+    #         return {
+    #             "attribute": attribute_name,
+    #             "predictions": results
+    #         }
+            
+    #     except Exception as e:
+    #         logger.error(f"Error in CLIP classification for {attribute_name}: {str(e)}")
+    #         return {"attribute": attribute_name, "predictions": []}
+    
+
     def classify_with_clip(
         self,
         image: Image.Image,
@@ -597,44 +653,54 @@ class VisualProcessingService:
             model, processor = self._get_clip_model()
             device = self._get_device()
             
-            # Prepare inputs
-            inputs = processor(
-                text=candidates,
-                images=image,
-                return_tensors="pt",
-                padding=True
-            )
-            
-            # Move to device
-            inputs = {k: v.to(device) for k, v in inputs.items()}
+            # ⚡ OPTIMIZATION: Process in smaller batches to avoid memory issues
+            batch_size = 16  # Process 16 candidates at a time
+            all_results = []
             
-            # Get predictions
-            with torch.no_grad():
-                outputs = model(**inputs)
-                logits_per_image = outputs.logits_per_image
-                probs = logits_per_image.softmax(dim=1).cpu()
-            
-            # Get top predictions
-            top_k = min(3, len(candidates))
-            top_probs, top_indices = torch.topk(probs[0], k=top_k)
+            for i in range(0, len(candidates), batch_size):
+                batch_candidates = candidates[i:i + batch_size]
+                
+                # Prepare inputs WITHOUT progress bars
+                inputs = processor(
+                    text=batch_candidates,
+                    images=image,
+                    return_tensors="pt",
+                    padding=True
+                )
+                
+                # Move to device
+                inputs = {k: v.to(device) for k, v in inputs.items()}
+                
+                # Get predictions
+                with torch.no_grad():
+                    outputs = model(**inputs)
+                    logits_per_image = outputs.logits_per_image
+                    probs = logits_per_image.softmax(dim=1).cpu()
+                
+                # Collect results from this batch
+                for j, prob in enumerate(probs[0]):
+                    if prob.item() > confidence_threshold:
+                        all_results.append({
+                            "value": batch_candidates[j],
+                            "confidence": round(float(prob.item()), 3)
+                        })
             
-            results = []
-            for prob, idx in zip(top_probs, top_indices):
-                if prob.item() > confidence_threshold:
-                    results.append({
-                        "value": candidates[idx.item()],
-                        "confidence": round(float(prob.item()), 3)
-                    })
+            # Sort by confidence and return top 3
+            all_results.sort(key=lambda x: x['confidence'], reverse=True)
             
             return {
                 "attribute": attribute_name,
-                "predictions": results
+                "predictions": all_results[:3]
             }
             
         except Exception as e:
             logger.error(f"Error in CLIP classification for {attribute_name}: {str(e)}")
             return {"attribute": attribute_name, "predictions": []}
-    
+
+
+
+
+
     def detect_category_and_subcategory(self, image: Image.Image) -> Tuple[str, str, str, float]:
         """
         Hierarchically detect category, subcategory, and specific product.
@@ -869,7 +935,6 @@ class VisualProcessingService:
 
 
 
-
 
 
 # # ==================== visual_processing_service_enhanced.py ====================
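Note on the classify_with_clip hunk above: the new version scores candidates in batches of 16 and applies softmax within each batch, so confidences from different batches are normalized separately and are not directly comparable. If comparable scores across all candidates are wanted, one alternative is to collect the raw logits per batch and apply softmax once at the end. A minimal sketch, assuming the same Hugging Face `CLIPModel`/`CLIPProcessor` pair used in the hunk (the function name is illustrative, not part of the service):

```python
# Sketch only: batched CLIP scoring with a single softmax over all candidates,
# so confidences stay on one scale across batches.
from typing import Dict, List

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor


def score_candidates_batched(
    model: CLIPModel,
    processor: CLIPProcessor,
    image: Image.Image,
    candidates: List[str],
    device: str = "cpu",
    batch_size: int = 16,
    confidence_threshold: float = 0.15,
) -> List[Dict]:
    logits = []
    for i in range(0, len(candidates), batch_size):
        batch = candidates[i:i + batch_size]
        # The image is re-encoded per batch here, matching the hunk's structure
        inputs = processor(text=batch, images=image, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            out = model(**inputs)
        # logits_per_image has shape (1, len(batch)); keep the raw logits
        logits.append(out.logits_per_image[0].cpu())

    # One softmax over every candidate, not per batch of 16
    probs = torch.cat(logits).softmax(dim=0)
    top_probs, top_idx = torch.topk(probs, k=min(3, len(candidates)))
    return [
        {"value": candidates[i.item()], "confidence": round(float(p.item()), 3)}
        for p, i in zip(top_probs, top_idx)
        if p.item() > confidence_threshold
    ]
```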

BIN
db.sqlite3


+ 1 - 1
media/generated_outputs/excel_generation_status.json

@@ -1,6 +1,6 @@
 {
     "status": "SUCCESS",
-    "timestamp": "2025-10-27T15:43:17.202230",
+    "timestamp": "2025-10-28T11:44:45.161843",
     "products_processed": 15,
     "products_successful": 15,
     "products_failed": 0,

BIN
media/generated_outputs/generated_products.xlsx


BIN
media/generated_outputs/~$generated_products.xlsx


Too many files were changed in this commit, so some files are not shown.