Student Yadav vor 3 Monaten
Ursprung
Commit
9abba55d72
3 geänderte Dateien mit 421 neuen und 980 gelöschten Zeilen
  1. 369 563
      attr_extraction/services.py
  2. 52 417
      attr_extraction/views.py
  3. BIN
      db.sqlite3

Datei-Diff unterdrückt, da er zu groß ist
+ 369 - 563
attr_extraction/services.py


+ 52 - 417
attr_extraction/views.py

@@ -1,246 +1,29 @@
-# #  #==================== views.py ====================
-# # from rest_framework.views import APIView
-# # from rest_framework.response import Response
-# # from rest_framework import status
-# # from .serializers import (
-# #     ProductAttributeRequestSerializer,
-# #     ProductAttributeResponseSerializer
-# # )
-# # from .services import ProductAttributeService
-
-
-# # class ExtractProductAttributesView(APIView):
-# #     """
-# #     API endpoint to extract product attributes using Groq LLM.
-    
-# #     POST /api/extract-attributes/
-    
-# #     Request Body:
-# #     {
-# #         "title": "Product title (optional)",
-# #         "short_desc": "Short description (optional)",
-# #         "long_desc": "Long description (optional)",
-# #         "mandatory_attrs": {
-# #             "Attribute1": ["value1", "value2", "value3"],
-# #             "Attribute2": ["valueA", "valueB"]
-# #         },
-# #         "model": "llama-3.1-8b-instant (optional)",
-# #         "extract_additional": true (optional, default: true)
-# #     }
-    
-# #     Response:
-# #     {
-# #         "mandatory": {
-# #             "Attribute1": "value1",
-# #             "Attribute2": "valueA"
-# #         },
-# #         "additional": {
-# #             "Color": "Blue",
-# #             "Brand": "Example"
-# #         }
-# #     }
-# #     """
-
-# #     def post(self, request):
-# #         # Validate request data
-# #         serializer = ProductAttributeRequestSerializer(data=request.data)
-# #         if not serializer.is_valid():
-# #             return Response(
-# #                 {"error": serializer.errors},
-# #                 status=status.HTTP_400_BAD_REQUEST
-# #             )
-
-# #         validated_data = serializer.validated_data
-
-# #         # Combine product text
-# #         product_text = ProductAttributeService.combine_product_text(
-# #             title=validated_data.get('title'),
-# #             short_desc=validated_data.get('short_desc'),
-# #             long_desc=validated_data.get('long_desc')
-# #         )
-
-# #         # Extract attributes
-# #         result = ProductAttributeService.extract_attributes(
-# #             product_text=product_text,
-# #             mandatory_attrs=validated_data['mandatory_attrs'],
-# #             model=validated_data.get('model'),
-# #             extract_additional=validated_data.get('extract_additional', True)
-# #         )
-
-# #         # Return response
-# #         response_serializer = ProductAttributeResponseSerializer(data=result)
-# #         if response_serializer.is_valid():
-# #             return Response(response_serializer.data, status=status.HTTP_200_OK)
-        
-# #         return Response(result, status=status.HTTP_200_OK)
-
-
-
-
-
-
-
-# from rest_framework.views import APIView
-# from rest_framework.response import Response
-# from rest_framework import status
-# from .serializers import (
-#     SingleProductRequestSerializer,
-#     BatchProductRequestSerializer,
-#     ProductAttributeResultSerializer,
-#     BatchProductResponseSerializer
-# )
-# from .services import ProductAttributeService
-
-
-# class ExtractProductAttributesView(APIView):
-#     """
-#     API endpoint to extract product attributes for a single product.
-    
-#     POST /api/extract-attributes/
-    
-#     Request Body:
-#     {
-#         "title": "Product title (optional)",
-#         "short_desc": "Short description (optional)",
-#         "long_desc": "Long description (optional)",
-#         "mandatory_attrs": {
-#             "Attribute1": ["value1", "value2", "value3"],
-#             "Attribute2": ["valueA", "valueB"]
-#         },
-#         "model": "llama-3.1-8b-instant (optional)",
-#         "extract_additional": true (optional, default: true)
-#     }
-#     """
-
-#     def post(self, request):
-#         serializer = SingleProductRequestSerializer(data=request.data)
-#         if not serializer.is_valid():
-#             return Response(
-#                 {"error": serializer.errors},
-#                 status=status.HTTP_400_BAD_REQUEST
-#             )
-
-#         validated_data = serializer.validated_data
-
-#         product_text = ProductAttributeService.combine_product_text(
-#             title=validated_data.get('title'),
-#             short_desc=validated_data.get('short_desc'),
-#             long_desc=validated_data.get('long_desc')
-#         )
-
-#         result = ProductAttributeService.extract_attributes(
-#             product_text=product_text,
-#             mandatory_attrs=validated_data['mandatory_attrs'],
-#             model=validated_data.get('model'),
-#             extract_additional=validated_data.get('extract_additional', True)
-#         )
-
-#         response_serializer = ProductAttributeResultSerializer(data=result)
-#         if response_serializer.is_valid():
-#             return Response(response_serializer.data, status=status.HTTP_200_OK)
-        
-#         return Response(result, status=status.HTTP_200_OK)
-
-
-# class BatchExtractProductAttributesView(APIView):
-#     """
-#     API endpoint to extract product attributes for multiple products in batch.
-    
-#     POST /api/batch-extract-attributes/
-    
-#     Request Body:
-#     {
-#         "products": [
-#             {
-#                 "product_id": "prod_001",
-#                 "title": "Product 1 title",
-#                 "short_desc": "Short description",
-#                 "long_desc": "Long description"
-#             },
-#             {
-#                 "product_id": "prod_002",
-#                 "title": "Product 2 title",
-#                 "short_desc": "Short description"
-#             }
-#         ],
-#         "mandatory_attrs": {
-#             "Attribute1": ["value1", "value2", "value3"],
-#             "Attribute2": ["valueA", "valueB"]
-#         },
-#         "model": "llama-3.1-8b-instant (optional)",
-#         "extract_additional": true (optional, default: true)
-#     }
-    
-#     Response:
-#     {
-#         "results": [
-#             {
-#                 "product_id": "prod_001",
-#                 "mandatory": {...},
-#                 "additional": {...}
-#             },
-#             {
-#                 "product_id": "prod_002",
-#                 "mandatory": {...},
-#                 "additional": {...}
-#             }
-#         ],
-#         "total_products": 2,
-#         "successful": 2,
-#         "failed": 0
-#     }
-#     """
-
-#     def post(self, request):
-#         serializer = BatchProductRequestSerializer(data=request.data)
-#         if not serializer.is_valid():
-#             return Response(
-#                 {"error": serializer.errors},
-#                 status=status.HTTP_400_BAD_REQUEST
-#             )
-
-#         validated_data = serializer.validated_data
-
-#         # Extract attributes for all products in batch
-#         result = ProductAttributeService.extract_attributes_batch(
-#             products=validated_data['products'],
-#             mandatory_attrs=validated_data['mandatory_attrs'],
-#             model=validated_data.get('model'),
-#             extract_additional=validated_data.get('extract_additional', True)
-#         )
-
-#         response_serializer = BatchProductResponseSerializer(data=result)
-#         if response_serializer.is_valid():
-#             return Response(response_serializer.data, status=status.HTTP_200_OK)
-        
-#         return Response(result, status=status.HTTP_200_OK)
-
-
-
-
-
-
-
-# ==================== views.py ====================
 from rest_framework.views import APIView
 from rest_framework.response import Response
 from rest_framework import status
+from rest_framework.parsers import MultiPartParser, FormParser
+from django.db import transaction
+import pandas as pd
+from .models import Product, ProductType, ProductAttribute, AttributePossibleValue
 from .serializers import (
     SingleProductRequestSerializer,
     BatchProductRequestSerializer,
-    ProductAttributeResultSerializer,
-    BatchProductResponseSerializer
+    ProductAttributeResultSerializer, 
+    BatchProductResponseSerializer,
+    ProductSerializer,
+    ProductTypeSerializer,
+    ProductAttributeSerializer,
+    AttributePossibleValueSerializer
 )
 from .services import ProductAttributeService
 from .ocr_service import OCRService
 
 
-from .models import Product
-
 class ExtractProductAttributesView(APIView):
     """
     API endpoint to extract product attributes for a single product by item_id.
-    Fetches product details from database.
+    Fetches product details from database with source tracking.
+    Returns attributes in array format: [{"value": "...", "source": "..."}]
     """
 
     def post(self, request):
@@ -285,20 +68,27 @@ class ExtractProductAttributesView(APIView):
                     for item in ocr_results["detected_text"]
                 ])
 
-        # Combine all product text
-        product_text = ProductAttributeService.combine_product_text(
+        # Combine all product text with source tracking
+        product_text, source_map = ProductAttributeService.combine_product_text(
             title=title,
             short_desc=short_desc,
             long_desc=long_desc,
             ocr_text=ocr_text
         )
 
-        # Extract attributes
+        # Extract attributes with enhanced features and source tracking
         result = ProductAttributeService.extract_attributes(
             product_text=product_text,
             mandatory_attrs=validated_data["mandatory_attrs"],
+            source_map=source_map,
             model=validated_data.get("model"),
-            extract_additional=validated_data.get("extract_additional", True)
+            extract_additional=validated_data.get("extract_additional", True),
+            multiple=validated_data.get("multiple", []),
+            threshold_abs=validated_data.get("threshold_abs", 0.65),
+            margin=validated_data.get("margin", 0.15),
+            use_dynamic_thresholds=validated_data.get("use_dynamic_thresholds", True),
+            use_adaptive_margin=validated_data.get("use_adaptive_margin", True),
+            use_semantic_clustering=validated_data.get("use_semantic_clustering", True)
         )
 
         # Attach OCR results if available
@@ -312,118 +102,11 @@ class ExtractProductAttributesView(APIView):
         return Response(result, status=status.HTTP_200_OK)
 
 
-from .models import Product
-
-# class BatchExtractProductAttributesView(APIView):
-#     """
-#     API endpoint to extract product attributes for multiple products in batch by item_id.
-#     Fetches all product details from database automatically.
-#     """
-
-#     def post(self, request):
-#         serializer = BatchProductRequestSerializer(data=request.data)
-#         if not serializer.is_valid():
-#             return Response({"error": serializer.errors}, status=status.HTTP_400_BAD_REQUEST)
-
-#         validated_data = serializer.validated_data
-#         item_ids = validated_data.get("item_ids", [])
-#         model = validated_data.get("model")
-#         extract_additional = validated_data.get("extract_additional", True)
-#         process_image = validated_data.get("process_image", True)
-#         mandatory_attrs = validated_data["mandatory_attrs"]
-
-#         # Fetch all products in one query
-#         products = Product.objects.filter(item_id__in=item_ids)
-#         found_ids = set(products.values_list("item_id", flat=True))
-#         missing_ids = [pid for pid in item_ids if pid not in found_ids]
-
-#         results = []
-#         successful = 0
-#         failed = 0
-
-#         for product in products:
-#             try:
-#                 title = product.product_name
-#                 short_desc = product.product_short_description
-#                 long_desc = product.product_long_description
-#                 image_url = product.image_path
-
-#                 ocr_results = None
-#                 ocr_text = None
-
-#                 if process_image and image_url:
-#                     ocr_service = OCRService()
-#                     ocr_results = ocr_service.process_image(image_url)
-
-#                     if ocr_results and ocr_results.get("detected_text"):
-#                         ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
-#                             ocr_results, model
-#                         )
-#                         ocr_results["extracted_attributes"] = ocr_attrs
-#                         ocr_text = "\n".join([
-#                             f"{item['text']} (confidence: {item['confidence']:.2f})"
-#                             for item in ocr_results["detected_text"]
-#                         ])
-
-#                 product_text = ProductAttributeService.combine_product_text(
-#                     title=title,
-#                     short_desc=short_desc,
-#                     long_desc=long_desc,
-#                     ocr_text=ocr_text
-#                 )
-
-#                 extracted = ProductAttributeService.extract_attributes(
-#                     product_text=product_text,
-#                     mandatory_attrs=mandatory_attrs,
-#                     model=model,
-#                     extract_additional=extract_additional
-#                 )
-
-#                 result = {
-#                     "product_id": product.item_id,
-#                     "mandatory": extracted.get("mandatory", {}),
-#                     "additional": extracted.get("additional", {}),
-#                 }
-
-#                 if ocr_results:
-#                     result["ocr_results"] = ocr_results
-
-#                 results.append(result)
-#                 successful += 1
-
-#             except Exception as e:
-#                 failed += 1
-#                 results.append({
-#                     "product_id": product.item_id,
-#                     "error": str(e)
-#                 })
-
-#         # Add missing item_ids as failed entries
-#         for mid in missing_ids:
-#             failed += 1
-#             results.append({
-#                 "product_id": mid,
-#                 "error": "Product not found in database"
-#             })
-
-#         batch_result = {
-#             "results": results,
-#             "total_products": len(item_ids),
-#             "successful": successful,
-#             "failed": failed
-#         }
-
-#         response_serializer = BatchProductResponseSerializer(data=batch_result)
-#         if response_serializer.is_valid():
-#             return Response(response_serializer.data, status=status.HTTP_200_OK)
-
-#         return Response(batch_result, status=status.HTTP_200_OK)
-
-
 class BatchExtractProductAttributesView(APIView):
     """
     API endpoint to extract product attributes for multiple products in batch.
-    Uses item-specific mandatory_attrs.
+    Uses item-specific mandatory_attrs with source tracking.
+    Returns attributes in array format: [{"value": "...", "source": "..."}]
     """
 
     def post(self, request):
@@ -434,10 +117,16 @@ class BatchExtractProductAttributesView(APIView):
         validated_data = serializer.validated_data
         
         # Get batch-level settings
-        product_list = validated_data.get("products", []) # New: list of {item_id, mandatory_attrs}
+        product_list = validated_data.get("products", [])
         model = validated_data.get("model")
         extract_additional = validated_data.get("extract_additional", True)
         process_image = validated_data.get("process_image", True)
+        multiple = validated_data.get("multiple", [])
+        threshold_abs = validated_data.get("threshold_abs", 0.65)
+        margin = validated_data.get("margin", 0.15)
+        use_dynamic_thresholds = validated_data.get("use_dynamic_thresholds", True)
+        use_adaptive_margin = validated_data.get("use_adaptive_margin", True)
+        use_semantic_clustering = validated_data.get("use_semantic_clustering", True)
         
         # Extract all item_ids to query the database efficiently
         item_ids = [p['item_id'] for p in product_list] 
@@ -464,7 +153,7 @@ class BatchExtractProductAttributesView(APIView):
                     "product_id": item_id,
                     "error": "Product not found in database"
                 })
-                continue # Skip to the next product
+                continue
 
             product = product_map[item_id]
             
@@ -477,14 +166,12 @@ class BatchExtractProductAttributesView(APIView):
                 ocr_results = None
                 ocr_text = None
 
-                # Image Processing Logic (same as before)
+                # Image Processing Logic
                 if process_image and image_url:
                     ocr_service = OCRService()
                     ocr_results = ocr_service.process_image(image_url)
 
                     if ocr_results and ocr_results.get("detected_text"):
-                        # Ensure the services are designed to handle 'mandatory_attrs'
-                        # for attribute extraction from OCR text
                         ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
                             ocr_results, model
                         )
@@ -494,19 +181,27 @@ class BatchExtractProductAttributesView(APIView):
                             for item in ocr_results["detected_text"]
                         ])
 
-                product_text = ProductAttributeService.combine_product_text(
+                # Combine product text with source tracking
+                product_text, source_map = ProductAttributeService.combine_product_text(
                     title=title,
                     short_desc=short_desc,
                     long_desc=long_desc,
                     ocr_text=ocr_text
                 )
 
-                # Attribute Extraction Logic - NOW USING ITEM-SPECIFIC mandatory_attrs
+                # Attribute Extraction with source tracking (returns array format)
                 extracted = ProductAttributeService.extract_attributes(
                     product_text=product_text,
-                    mandatory_attrs=mandatory_attrs, # <--- Changed: now item-specific
+                    mandatory_attrs=mandatory_attrs,
+                    source_map=source_map,
                     model=model,
-                    extract_additional=extract_additional
+                    extract_additional=extract_additional,
+                    multiple=multiple,
+                    threshold_abs=threshold_abs,
+                    margin=margin,
+                    use_dynamic_thresholds=use_dynamic_thresholds,
+                    use_adaptive_margin=use_adaptive_margin,
+                    use_semantic_clustering=use_semantic_clustering
                 )
 
                 result = {
@@ -528,10 +223,6 @@ class BatchExtractProductAttributesView(APIView):
                     "error": str(e)
                 })
 
-        # No need for a separate missing_ids loop since we handle it when iterating over product_list
-        # The list comprehension `item_ids = [p['item_id'] for p in product_list]` and the check 
-        # `if item_id not in found_ids:` now correctly handle missing products from the input list.
-
         batch_result = {
             "results": results,
             "total_products": len(product_list),
@@ -546,13 +237,6 @@ class BatchExtractProductAttributesView(APIView):
         return Response(batch_result, status=status.HTTP_200_OK)
 
 
-
-from rest_framework.views import APIView
-from rest_framework.response import Response
-from rest_framework import status
-from .models import Product
-from .serializers import ProductSerializer
-
 class ProductListView(APIView):
     """
     GET API to list all products with details
@@ -563,17 +247,6 @@ class ProductListView(APIView):
         return Response(serializer.data, status=status.HTTP_200_OK)
 
 
-
-
-import pandas as pd
-from rest_framework.parsers import MultiPartParser, FormParser
-from rest_framework.views import APIView
-from rest_framework.response import Response
-from rest_framework import status
-from .models import Product
-from .serializers import ProductSerializer
-
-
 class ProductUploadExcelView(APIView):
     """
     POST API to upload an Excel file and add data to Product model (skip duplicates)
@@ -586,7 +259,6 @@ class ProductUploadExcelView(APIView):
             return Response({'error': 'No file provided'}, status=status.HTTP_400_BAD_REQUEST)
 
         try:
-            import pandas as pd
             df = pd.read_excel(file_obj)
             df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
 
@@ -635,16 +307,6 @@ class ProductUploadExcelView(APIView):
             return Response({'error': str(e)}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
 
 
-
-import pandas as pd
-from rest_framework.views import APIView
-from rest_framework.response import Response
-from rest_framework import status
-from rest_framework.parsers import MultiPartParser, FormParser
-from .models import ProductType, ProductAttribute, AttributePossibleValue
-
-
-
 class ProductAttributesUploadView(APIView):
     """
     POST API to upload an Excel file and add mandatory/additional attributes
@@ -696,16 +358,6 @@ class ProductAttributesUploadView(APIView):
             return Response({"error": str(e)}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
 
 
-
-
-
-from rest_framework.views import APIView
-from rest_framework.response import Response
-from rest_framework import status
-from .models import ProductType, ProductAttribute, AttributePossibleValue
-from .serializers import ProductTypeSerializer, ProductAttributeSerializer, AttributePossibleValueSerializer
-from django.db import transaction
-
 class ProductTypeAttributesView(APIView):
     """
     API to view, create, update, and delete product type attributes and their possible values.
@@ -738,9 +390,9 @@ class ProductTypeAttributesView(APIView):
         Expected payload example:
         {
             "product_type": "Hardware Screws",
-            "attribute_name": "Material",  // Optional if only creating product type
-            "is_mandatory": "Yes",        // Optional if only creating product type
-            "possible_values": "Steel, Zinc Plated, Stainless Steel"  // Optional
+            "attribute_name": "Material",
+            "is_mandatory": "Yes",
+            "possible_values": "Steel, Zinc Plated, Stainless Steel"
         }
         """
         try:
@@ -759,7 +411,6 @@ class ProductTypeAttributesView(APIView):
                 product_type, created = ProductType.objects.get_or_create(name=product_type_name)
 
                 if created and not attribute_name:
-                    # Only product type was created
                     return Response({
                         "message": f"Product type '{product_type_name}' created successfully",
                         "data": {"product_type": product_type_name}
@@ -868,7 +519,7 @@ class ProductTypeAttributesView(APIView):
         Expected payload example:
         {
             "product_type": "Hardware Screws",
-            "attribute_name": "Material"  // Optional: if omitted, deletes entire product type
+            "attribute_name": "Material"
         }
         """
         try:
@@ -904,7 +555,7 @@ class ProductTypeAttributesView(APIView):
                             "error": f"Attribute '{attribute_name}' not found for product type '{product_type_name}'"
                         }, status=status.HTTP_404_NOT_FOUND)
                 else:
-                    # Delete entire product type (and its attributes and possible values)
+                    # Delete entire product type
                     product_type.delete()
                     return Response({
                         "message": f"Product type '{product_type_name}' and all its attributes deleted successfully"
@@ -912,28 +563,12 @@ class ProductTypeAttributesView(APIView):
 
         except Exception as e:
             return Response({"error": str(e)}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
-        
 
 
-
-
-# views.py
-from rest_framework.views import APIView
-from rest_framework.response import Response
-from rest_framework import status
-from .models import ProductType
-from .serializers import ProductTypeSerializer
-
-
-from rest_framework.views import APIView
-from rest_framework.response import Response
-from rest_framework import status
-from .models import ProductType
-
 class ProductTypeListView(APIView):
     """
     GET API to list all product types (only names).
     """
     def get(self, request):
         product_types = ProductType.objects.values_list('name', flat=True)
-        return Response({"product_types": list(product_types)}, status=status.HTTP_200_OK)
+        return Response({"product_types": list(product_types)}, status=status.HTTP_200_OK)

BIN
db.sqlite3


Einige Dateien werden nicht angezeigt, da zu viele Dateien in diesem Diff geändert wurden.