
cleaning files

Harshit Pathak 3 months ago
Parent
Current commit
e3bba06e75
2 files changed, 59 insertions and 2,816 deletions
  1. attr_extraction/services.py (+0, -2300)
  2. attr_extraction/views.py (+59, -516)

attr_extraction/services.py (+0, -2300)

@@ -1,2303 +1,3 @@
-
-
-
-# # # ==================== services.py ====================
-# # import requests
-# # import json
-# # import re
-# # from typing import Dict, List, Optional, Tuple
-# # from django.conf import settings
-# # from concurrent.futures import ThreadPoolExecutor, as_completed
-# # from sentence_transformers import SentenceTransformer, util
-# # import numpy as np
-# # from .ocr_service import OCRService
-
-
-# # # Initialize embedding model for normalization
-# # model_embedder = SentenceTransformer("all-MiniLM-L6-v2")
-
-
-# # class ProductAttributeService:
-# #     """Service class for extracting product attributes using Groq LLM."""
-
-# #     @staticmethod
-# #     def normalize_dimension_text(text: str) -> str:
-# #         """
-# #         Normalize dimension text to match format like '16x20', '20x30', etc.
-# #         Handles formats like '16 x 20', '16x1.5x20', '16 x 1.5 x 20 Inches'
-# #         Returns the normalized dimension (e.g., '16x20') or empty string if not found.
-# #         """
-# #         if not text:
-# #             return ""
-        
-# #         # Convert to lowercase and remove common units
-# #         text = text.lower()
-# #         text = re.sub(r'\s*(inches|inch|in|cm|centimeters|mm|millimeters)\s*', '', text, flags=re.IGNORECASE)
-        
-# #         # Extract all numbers from the text
-# #         numbers = re.findall(r'\d+\.?\d*', text)
-        
-# #         if not numbers:
-# #             return ""
-        
-# #         # Convert to floats first to handle decimals properly
-# #         float_numbers = []
-# #         for num in numbers:
-# #             try:
-# #                 float_numbers.append(float(num))
-# #             except ValueError:
-# #                 continue
-        
-# #         if len(float_numbers) < 2:
-# #             return ""
-        
-# #         # If we have 3 dimensions, it's likely Width x Depth x Height
-# #         # For wall art, depth is usually small (< 5), so we keep first and last
-# #         if len(float_numbers) == 3:
-# #             # Keep first and last values (width and height), skip middle (depth)
-# #             float_numbers = [float_numbers[0], float_numbers[2]]
-# #         elif len(float_numbers) > 3:
-# #             # If more than 3 dimensions, keep the two largest
-# #             float_numbers = sorted(float_numbers)[-2:]
-# #         else:
-# #             # Just 2 dimensions, use as is
-# #             float_numbers = float_numbers[:2]
-        
-# #         # Format numbers: use integer if whole, else one decimal
-# #         formatted_numbers = []
-# #         for num in float_numbers:
-# #             if num.is_integer():
-# #                 formatted_numbers.append(str(int(num)))
-# #             else:
-# #                 formatted_numbers.append(f"{num:.1f}")
-        
-# #         # Sort to ensure consistent order (smaller x larger)
-# #         formatted_numbers.sort(key=lambda x: float(x))
-        
-# #         # Return formatted dimension
-# #         return f"{formatted_numbers[0]}x{formatted_numbers[1]}"
-
-    
-    
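For reference, a minimal standalone sketch of the dimension normalizer removed above (standard library only; the asserts illustrate the intended behaviour):

import re

def normalize_dims(text: str) -> str:
    # Strip units, pull out the numbers, drop the middle value of a
    # W x D x H triple, and return "small x large" - as in the code above.
    text = re.sub(r'\s*(inches|inch|in|cm|centimeters|mm|millimeters)\s*', '', text.lower())
    nums = [float(n) for n in re.findall(r'\d+\.?\d*', text)]
    if len(nums) < 2:
        return ""
    if len(nums) == 3:
        nums = [nums[0], nums[2]]      # treat the middle number as depth
    elif len(nums) > 3:
        nums = sorted(nums)[-2:]       # keep the two largest
    fmt = sorted((str(int(n)) if n.is_integer() else f"{n:.1f}" for n in nums[:2]), key=float)
    return f"{fmt[0]}x{fmt[1]}"

assert normalize_dims("16 x 1.5 x 20 Inches") == "16x20"
assert normalize_dims("20 x 30 cm") == "20x30"
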
-# #     @staticmethod
-# #     def normalize_value_for_matching(value: str, attr_name: str = "") -> str:
-# #         """
-# #         Normalize a value based on its attribute type for better matching.
-# #         Currently handles dimensions specially, can be extended for other attributes.
-# #         """
-# #         # Check if this is a dimension-related attribute
-# #         dimension_keywords = ['dimension', 'size', 'measurement']
-# #         if any(keyword in attr_name.lower() for keyword in dimension_keywords):
-# #             normalized = ProductAttributeService.normalize_dimension_text(value)
-# #             if normalized:
-# #                 return normalized
-        
-# #         # For other attributes, just return cleaned value
-# #         return value.strip()
-
-# #     @staticmethod
-# #     def combine_product_text(
-# #         title: Optional[str] = None,
-# #         short_desc: Optional[str] = None,
-# #         long_desc: Optional[str] = None,
-# #         ocr_text: Optional[str] = None
-# #     ) -> Tuple[str, Dict[str, str]]:
-# #         """
-# #         Combine product metadata into a single text block.
-# #         Returns: (combined_text, source_map) where source_map tracks which text came from where
-# #         """
-# #         parts = []
-# #         source_map = {}
-        
-# #         if title:
-# #             title_str = str(title).strip()
-# #             parts.append(f"Title: {title_str}")
-# #             source_map['title'] = title_str
-# #         if short_desc:
-# #             short_str = str(short_desc).strip()
-# #             parts.append(f"Description: {short_str}")
-# #             source_map['short_desc'] = short_str
-# #         if long_desc:
-# #             long_str = str(long_desc).strip()
-# #             parts.append(f"Details: {long_str}")
-# #             source_map['long_desc'] = long_str
-# #         if ocr_text:
-# #             parts.append(f"OCR Text: {ocr_text}")
-# #             source_map['ocr_text'] = ocr_text
-        
-# #         combined = "\n".join(parts).strip()
-        
-# #         if not combined:
-# #             return "No product information available", {}
-        
-# #         return combined, source_map
-
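A condensed, runnable sketch of the same combine step, showing the (text, source_map) pair it produces (sample inputs are illustrative):

def combine(title=None, short_desc=None, long_desc=None, ocr_text=None):
    # Mirrors combine_product_text above: label each part, record its origin.
    fields = [("title", "Title", title), ("short_desc", "Description", short_desc),
              ("long_desc", "Details", long_desc), ("ocr_text", "OCR Text", ocr_text)]
    parts, source_map = [], {}
    for key, label, value in fields:
        if value:
            value = str(value).strip()
            parts.append(f"{label}: {value}")
            source_map[key] = value
    return ("\n".join(parts) or "No product information available"), source_map

text, srcs = combine(title="Sunset Canvas Print 16x20",
                     short_desc="Wall art for the living room")
assert srcs == {"title": "Sunset Canvas Print 16x20",
                "short_desc": "Wall art for the living room"}
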
-# #     @staticmethod
-# #     def find_value_source(value: str, source_map: Dict[str, str], attr_name: str = "") -> str:
-# #         """
-# #         Find which source(s) contain the given value.
-# #         Returns the source name(s) where the value appears.
-# #         Now handles normalized matching for dimensions.
-# #         """
-# #         value_lower = value.lower()
-# #         # Split value into tokens for better matching
-# #         value_tokens = set(value_lower.replace("-", " ").replace("x", " ").split())
-        
-# #         # Check if this is a dimension-related attribute
-# #         is_dimension_attr = any(keyword in attr_name.lower() for keyword in ['dimension', 'size', 'measurement'])
-        
-# #         sources_found = []
-# #         source_scores = {}
-        
-# #         for source_name, source_text in source_map.items():
-# #             source_lower = source_text.lower()
-            
-# #             # Check for exact phrase match first
-# #             if value_lower in source_lower:
-# #                 source_scores[source_name] = 1.0
-# #                 continue
-            
-# #             # For dimensions, check normalized match
-# #             if is_dimension_attr:
-# #                 # Normalize the value (e.g., "16x20" stays "16x20")
-# #                 normalized_value = ProductAttributeService.normalize_dimension_text(value)
-# #                 if not normalized_value:
-# #                     normalized_value = value.replace("x", " ").strip()
-                
-# #                 # Normalize the source text to extract dimensions
-# #                 normalized_source = ProductAttributeService.normalize_dimension_text(source_text)
-                
-# #                 # Direct match
-# #                 if normalized_value == normalized_source:
-# #                     source_scores[source_name] = 0.95
-# #                     continue
-                
-# #                 # Also check if the dimension numbers appear in the source
-# #                 # Extract dimension parts (e.g., "16x20" -> ["16", "20"])
-# #                 dim_parts = normalized_value.split("x") if "x" in normalized_value else []
-# #                 if len(dim_parts) == 2:
-# #                     # Check if both numbers appear in the source
-# #                     if all(part in source_text for part in dim_parts):
-# #                         source_scores[source_name] = 0.85
-# #                         continue
-            
-# #             # Check for token matches
-# #             token_matches = sum(1 for token in value_tokens if token and token in source_lower)
-# #             if token_matches > 0 and len(value_tokens) > 0:
-# #                 source_scores[source_name] = token_matches / len(value_tokens)
-        
-# #         # Return source with highest score, or all sources if multiple have same score
-# #         if source_scores:
-# #             max_score = max(source_scores.values())
-# #             sources_found = [s for s, score in source_scores.items() if score == max_score]
-            
-# #             # Prioritize: title > short_desc > long_desc > ocr_text
-# #             priority = ['title', 'short_desc', 'long_desc', 'ocr_text']
-# #             for p in priority:
-# #                 if p in sources_found:
-# #                     return p
-            
-# #             return sources_found[0] if sources_found else "Not found"
-        
-# #         return "Not found"
-
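When several sources score equally, the tie is broken by the fixed priority order; e.g.:

# Tie-break used in find_value_source above:
priority = ['title', 'short_desc', 'long_desc', 'ocr_text']
sources_found = ['long_desc', 'title']   # both matched with the same score
best = next((p for p in priority if p in sources_found), "Not found")
assert best == 'title'
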
-# #     @staticmethod
-# #     def format_visual_attributes(visual_attributes: Dict) -> Dict:
-# #         """
-# #         Convert visual attributes to array format with source tracking.
-# #         Source is always 'image' for visual attributes.
-# #         """
-# #         formatted = {}
-        
-# #         for key, value in visual_attributes.items():
-# #             if isinstance(value, list):
-# #                 # Already a list (like color_palette)
-# #                 formatted[key] = [{"value": str(item), "source": "image"} for item in value]
-# #             elif isinstance(value, dict):
-# #                 # Nested dictionary - format recursively
-# #                 nested_formatted = {}
-# #                 for nested_key, nested_value in value.items():
-# #                     if isinstance(nested_value, list):
-# #                         nested_formatted[nested_key] = [{"value": str(item), "source": "image"} for item in nested_value]
-# #                     else:
-# #                         nested_formatted[nested_key] = [{"value": str(nested_value), "source": "image"}]
-# #                 formatted[key] = nested_formatted
-# #             else:
-# #                 # Single value
-# #                 formatted[key] = [{"value": str(value), "source": "image"}]
-        
-# #         return formatted
-
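Illustrative input/output for the conversion above (nested dicts omitted): scalars and lists alike end up as arrays of {"value", "source"} entries with source fixed to "image".

visual = {"dominant_color": "teal", "color_palette": ["teal", "sand"]}
formatted = {
    k: [{"value": str(x), "source": "image"} for x in (v if isinstance(v, list) else [v])]
    for k, v in visual.items()
}
assert formatted["dominant_color"] == [{"value": "teal", "source": "image"}]
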
-# #     @staticmethod
-# #     def extract_attributes_from_ocr(ocr_results: Dict, model: str = None) -> Dict:
-# #         """Extract structured attributes from OCR text using LLM."""
-# #         if model is None:
-# #             model = settings.SUPPORTED_MODELS[0]
-        
-# #         detected_text = ocr_results.get('detected_text', [])
-# #         if not detected_text:
-# #             return {}
-        
-# #         # Format OCR text for prompt
-# #         ocr_text = "\n".join([f"Text: {item['text']}, Confidence: {item['confidence']:.2f}" 
-# #                               for item in detected_text])
-        
-# #         prompt = f"""
-# # You are an AI model that extracts structured attributes from OCR text detected on product images.
-# # Given the OCR detections below, infer the possible product attributes and return them as a clean JSON object.
-
-# # OCR Text:
-# # {ocr_text}
-
-# # Extract relevant attributes like:
-# # - brand
-# # - model_number
-# # - size (waist_size, length, etc.)
-# # - collection
-# # - any other relevant product information
-
-# # Return a JSON object with only the attributes you can confidently identify.
-# # If an attribute is not present, do not include it in the response.
-# # """
-        
-# #         payload = {
-# #             "model": model,
-# #             "messages": [
-# #                 {
-# #                     "role": "system",
-# #                     "content": "You are a helpful AI that extracts structured data from OCR output. Return only valid JSON."
-# #                 },
-# #                 {"role": "user", "content": prompt}
-# #             ],
-# #             "temperature": 0.2,
-# #             "max_tokens": 500
-# #         }
-        
-# #         headers = {
-# #             "Authorization": f"Bearer {settings.GROQ_API_KEY}",
-# #             "Content-Type": "application/json",
-# #         }
-        
-# #         try:
-# #             response = requests.post(
-# #                 settings.GROQ_API_URL,
-# #                 headers=headers,
-# #                 json=payload,
-# #                 timeout=30
-# #             )
-# #             response.raise_for_status()
-# #             result_text = response.json()["choices"][0]["message"]["content"].strip()
-            
-# #             # Clean and parse JSON
-# #             result_text = ProductAttributeService._clean_json_response(result_text)
-# #             parsed = json.loads(result_text)
-            
-# #             # Convert to array format with source tracking
-# #             formatted_attributes = {}
-# #             for key, value in parsed.items():
-# #                 if key == "error":
-# #                     continue
-                
-# #                 # Handle nested dictionaries (like size)
-# #                 if isinstance(value, dict):
-# #                     nested_formatted = {}
-# #                     for nested_key, nested_value in value.items():
-# #                         nested_formatted[nested_key] = [{"value": str(nested_value), "source": "image"}]
-# #                     formatted_attributes[key] = nested_formatted
-# #                 elif isinstance(value, list):
-# #                     # Already a list, convert each item
-# #                     formatted_attributes[key] = [{"value": str(item), "source": "image"} for item in value]
-# #                 else:
-# #                     # Single value
-# #                     formatted_attributes[key] = [{"value": str(value), "source": "image"}]
-            
-# #             return formatted_attributes
-# #         except Exception as e:
-# #             return {"error": f"Failed to extract attributes from OCR: {str(e)}"}
-
-# #     @staticmethod
-# #     def calculate_attribute_relationships(
-# #         mandatory_attrs: Dict[str, List[str]],
-# #         product_text: str
-# #     ) -> Dict[str, float]:
-# #         """
-# #         Calculate semantic relationships between attribute values across different attributes.
-# #         Returns a matrix of cross-attribute value similarities.
-# #         """
-# #         pt_emb = model_embedder.encode(product_text, convert_to_tensor=True)
-
-# #         # Calculate similarities between all attribute values and product text
-# #         attr_scores = {}
-# #         for attr, values in mandatory_attrs.items():
-# #             attr_scores[attr] = {}
-# #             for val in values:
-# #                 contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}"]
-# #                 ctx_embs = [model_embedder.encode(c, convert_to_tensor=True) for c in contexts]
-# #                 sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
-# #                 attr_scores[attr][val] = sem_sim
-
-# #         # Calculate cross-attribute value relationships
-# #         relationships = {}
-# #         attr_list = list(mandatory_attrs.keys())
-
-# #         for i, attr1 in enumerate(attr_list):
-# #             for attr2 in attr_list[i+1:]:
-# #                 # Calculate pairwise similarities between values of different attributes
-# #                 for val1 in mandatory_attrs[attr1]:
-# #                     for val2 in mandatory_attrs[attr2]:
-# #                         emb1 = model_embedder.encode(val1, convert_to_tensor=True)
-# #                         emb2 = model_embedder.encode(val2, convert_to_tensor=True)
-# #                         sim = float(util.cos_sim(emb1, emb2).item())
-
-# #                         # Store bidirectional relationships
-# #                         key1 = f"{attr1}:{val1}->{attr2}:{val2}"
-# #                         key2 = f"{attr2}:{val2}->{attr1}:{val1}"
-# #                         relationships[key1] = sim
-# #                         relationships[key2] = sim
-
-# #         return relationships
-
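Each directed pair key stores a plain cosine similarity between the two value embeddings; a two-value sketch (requires sentence-transformers, as imported above; "Furniture" is an invented attribute name):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")
emb1 = model.encode("Living Room", convert_to_tensor=True)
emb2 = model.encode("Sofa", convert_to_tensor=True)
sim = float(util.cos_sim(emb1, emb2).item())
# Stored under both directions, e.g. "Room:Living Room->Furniture:Sofa"
# and "Furniture:Sofa->Room:Living Room".
print(sim)
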
-# #     @staticmethod
-# #     def calculate_value_clusters(
-# #         values: List[str],
-# #         scores: List[Tuple[str, float]],
-# #         cluster_threshold: float = 0.4
-# #     ) -> List[List[str]]:
-# #         """
-# #         Group values into semantic clusters based on their similarity to each other.
-# #         Returns clusters of related values.
-# #         """
-# #         if len(values) <= 1:
-# #             return [[val] for val, _ in scores]
-
-# #         # Get embeddings for all values
-# #         embeddings = [model_embedder.encode(val, convert_to_tensor=True) for val in values]
-
-# #         # Calculate pairwise similarities
-# #         similarity_matrix = np.zeros((len(values), len(values)))
-# #         for i in range(len(values)):
-# #             for j in range(i+1, len(values)):
-# #                 sim = float(util.cos_sim(embeddings[i], embeddings[j]).item())
-# #                 similarity_matrix[i][j] = sim
-# #                 similarity_matrix[j][i] = sim
-
-# #         # Simple clustering: group values with high similarity
-# #         clusters = []
-# #         visited = set()
-
-# #         for i, (val, score) in enumerate(scores):
-# #             if i in visited:
-# #                 continue
-
-# #             cluster = [val]
-# #             visited.add(i)
-
-# #             # Find similar values
-# #             for j in range(len(values)):
-# #                 if j not in visited and similarity_matrix[i][j] >= cluster_threshold:
-# #                     cluster.append(values[j])
-# #                     visited.add(j)
-
-# #             clusters.append(cluster)
-
-# #         return clusters
-
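The clustering is a single greedy pass over a pairwise similarity matrix; a self-contained toy version with a hand-written matrix shows the grouping behaviour:

def greedy_clusters(values, sim, threshold=0.4):
    # Same greedy pass as calculate_value_clusters above: each unvisited value
    # seeds a cluster and absorbs every other unvisited value whose
    # similarity clears the threshold.
    clusters, visited = [], set()
    for i, val in enumerate(values):
        if i in visited:
            continue
        cluster = [val]
        visited.add(i)
        for j in range(len(values)):
            if j not in visited and sim[i][j] >= threshold:
                cluster.append(values[j])
                visited.add(j)
        clusters.append(cluster)
    return clusters

vals = ["Living Room", "Lounge", "Bathroom"]
sim = [[1.0, 0.7, 0.1],   # hand-written similarities, not model output
       [0.7, 1.0, 0.1],
       [0.1, 0.1, 1.0]]
assert greedy_clusters(vals, sim) == [["Living Room", "Lounge"], ["Bathroom"]]
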
-# #     @staticmethod
-# #     def get_dynamic_threshold(
-# #         attr: str,
-# #         val: str,
-# #         base_score: float,
-# #         extracted_attrs: Dict[str, List[Dict[str, str]]],
-# #         relationships: Dict[str, float],
-# #         mandatory_attrs: Dict[str, List[str]],
-# #         base_threshold: float = 0.65,
-# #         boost_factor: float = 0.15
-# #     ) -> float:
-# #         """
-# #         Calculate dynamic threshold based on relationships with already-extracted attributes.
-# #         """
-# #         threshold = base_threshold
-
-# #         # Check relationships with already extracted attributes
-# #         max_relationship = 0.0
-# #         for other_attr, other_values_list in extracted_attrs.items():
-# #             if other_attr == attr:
-# #                 continue
-
-# #             for other_val_dict in other_values_list:
-# #                 other_val = other_val_dict['value']
-# #                 key = f"{attr}:{val}->{other_attr}:{other_val}"
-# #                 if key in relationships:
-# #                     max_relationship = max(max_relationship, relationships[key])
-
-# #         # If strong relationship exists, lower threshold
-# #         if max_relationship > 0.6:
-# #             threshold = base_threshold - (boost_factor * max_relationship)
-
-# #         return max(0.3, threshold)
-
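The arithmetic, isolated from the lookup plumbing: a strong cross-attribute relationship (above 0.6) lowers the acceptance bar, floored at 0.3.

def dynamic_threshold(max_relationship, base=0.65, boost=0.15):
    # Same formula as get_dynamic_threshold above.
    if max_relationship > 0.6:
        return max(0.3, base - boost * max_relationship)
    return base

assert dynamic_threshold(0.8) == 0.65 - 0.15 * 0.8   # 0.53
assert dynamic_threshold(0.5) == 0.65                # weak link: unchanged
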
-# #     @staticmethod
-# #     def get_adaptive_margin(
-# #         scores: List[Tuple[str, float]],
-# #         base_margin: float = 0.15,
-# #         max_margin: float = 0.22
-# #     ) -> float:
-# #         """
-# #         Calculate adaptive margin based on score distribution.
-# #         """
-# #         if len(scores) < 2:
-# #             return base_margin
-
-# #         score_values = [s for _, s in scores]
-# #         best_score = score_values[0]
-
-# #         # If best score is very low, use adaptive margin but be more conservative
-# #         if best_score < 0.5:
-# #             # Calculate score spread in top 3-4 scores only (more selective)
-# #             top_scores = score_values[:min(4, len(score_values))]
-# #             score_range = max(top_scores) - min(top_scores)
-
-# #             # Very controlled margin increase
-# #             if score_range < 0.30:
-# #                 # Much more conservative scaling
-# #                 score_factor = (0.5 - best_score) * 0.35
-# #                 adaptive = base_margin + score_factor + (0.30 - score_range) * 0.2
-# #                 return min(adaptive, max_margin)
-
-# #         return base_margin
-
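A worked example of the adaptive branch: with top scores 0.45 / 0.40 / 0.35 the margin grows from 0.15 to roughly 0.2075, still under the 0.22 cap.

# Numbers plugged into get_adaptive_margin above (illustrative):
best, top = 0.45, [0.45, 0.40, 0.35]   # best < 0.5 triggers the adaptive path
score_range = max(top) - min(top)      # 0.10, below the 0.30 gate
score_factor = (0.5 - best) * 0.35     # 0.0175
adaptive = 0.15 + score_factor + (0.30 - score_range) * 0.2
print(min(adaptive, 0.22))             # ~0.2075
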
-# #     @staticmethod
-# #     def _lexical_evidence(product_text: str, label: str) -> float:
-# #         """Calculate lexical overlap between product text and label."""
-# #         pt = product_text.lower()
-# #         tokens = [t for t in label.lower().replace("-", " ").split() if t]
-# #         if not tokens:
-# #             return 0.0
-# #         hits = sum(1 for t in tokens if t in pt)
-# #         return hits / len(tokens)
-
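E.g., the label "Living Room" scores 0.5 against a text that mentions only "room":

def lexical_evidence(product_text: str, label: str) -> float:
    # Same token-overlap ratio as _lexical_evidence above.
    pt = product_text.lower()
    tokens = [t for t in label.lower().replace("-", " ").split() if t]
    return sum(1 for t in tokens if t in pt) / len(tokens) if tokens else 0.0

assert lexical_evidence("Perfect for any room", "Living Room") == 0.5
assert lexical_evidence("Bright living room decor", "Living Room") == 1.0
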
-# #     @staticmethod
-# #     def normalize_against_product_text(
-# #     product_text: str,
-# #     mandatory_attrs: Dict[str, List[str]],
-# #     source_map: Dict[str, str],
-# #     threshold_abs: float = 0.65,
-# #     margin: float = 0.15,
-# #     allow_multiple: bool = False,
-# #     sem_weight: float = 0.8,
-# #     lex_weight: float = 0.2,
-# #     extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
-# #     relationships: Optional[Dict[str, float]] = None,
-# #     use_dynamic_thresholds: bool = True,
-# #     use_adaptive_margin: bool = True,
-# #     use_semantic_clustering: bool = True
-# # ) -> dict:
-# #         """
-# #         Score each allowed value against the product_text with dynamic thresholds.
-# #         Returns dict with values in array format: [{"value": "...", "source": "..."}]
-# #         """
-# #         if extracted_attrs is None:
-# #             extracted_attrs = {}
-# #         if relationships is None:
-# #             relationships = {}
-
-# #         pt_emb = model_embedder.encode(product_text, convert_to_tensor=True)
-# #         extracted = {}
-
-# #         for attr, allowed_values in mandatory_attrs.items():
-# #             scores: List[Tuple[str, float]] = []
-            
-# #             # Check if this is a dimension attribute
-# #             is_dimension_attr = any(keyword in attr.lower() for keyword in ['dimension', 'size', 'measurement'])
-            
-# #             # Normalize product text once for dimension matching
-# #             normalized_product_text = ProductAttributeService.normalize_dimension_text(product_text) if is_dimension_attr else ""
-
-# #             for val in allowed_values:
-# #                 # For dimension attributes, try exact normalized matching first
-# #                 if is_dimension_attr:
-# #                     # Normalize the allowed value from the list
-# #                     normalized_val = ProductAttributeService.normalize_dimension_text(val)
-                    
-# #                     # If we have both normalized values and they match exactly, give highest score
-# #                     if normalized_val and normalized_product_text and normalized_val == normalized_product_text:
-# #                         scores.append((val, 1.0))
-# #                         continue
-                    
-# #                     # Also check if the normalized value appears in the original product text
-# #                     # This handles cases where the format might be slightly different
-# #                     if normalized_val:
-# #                         # Extract just the numbers for flexible matching
-# #                         val_numbers = normalized_val.split('x')
-# #                         # Check if both numbers appear in the product text in close proximity
-# #                         text_lower = product_text.lower()
-# #                         if all(num in text_lower for num in val_numbers):
-# #                             # Calculate proximity score
-# #                             idx1 = text_lower.find(val_numbers[0])
-# #                             idx2 = text_lower.find(val_numbers[1])
-# #                             if idx1 != -1 and idx2 != -1:
-# #                                 distance = abs(idx2 - idx1)
-# #                                 # If numbers are close together (within 20 characters), high score
-# #                                 if distance < 20:
-# #                                     scores.append((val, 0.95))
-# #                                     continue
-                
-# #                 # Standard semantic matching for all attributes
-# #                 contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}", f"{val} room"]
-# #                 ctx_embs = [model_embedder.encode(c, convert_to_tensor=True) for c in contexts]
-# #                 sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
-
-# #                 lex_score = ProductAttributeService._lexical_evidence(product_text, val)
-# #                 final_score = sem_weight * sem_sim + lex_weight * lex_score
-# #                 scores.append((val, final_score))
-
-# #             scores.sort(key=lambda x: x[1], reverse=True)
-# #             best_val, best_score = scores[0]
-
-# #             # Calculate adaptive margin if enabled
-# #             effective_margin = margin
-# #             if allow_multiple and use_adaptive_margin:
-# #                 effective_margin = ProductAttributeService.get_adaptive_margin(scores, margin)
-
-# #             # Special handling for dimension attributes with exact matches
-# #             # If we have a very high score (0.90+), it means we found an exact/normalized match
-# #             # In this case, don't apply multiple selection logic - just return the best match
-# #             if is_dimension_attr and best_score >= 0.90:
-# #                 source = ProductAttributeService.find_value_source(best_val, source_map, attr)
-# #                 extracted[attr] = [{"value": best_val, "source": source}]
-# #                 continue
-
-# #             if not allow_multiple:
-# #                 source = ProductAttributeService.find_value_source(best_val, source_map, attr)
-# #                 extracted[attr] = [{"value": best_val, "source": source}]
-# #             else:
-# #                 candidates = [best_val]
-# #                 use_base_threshold = best_score >= threshold_abs
-
-# #                 # Get semantic clusters if enabled
-# #                 clusters = []
-# #                 if use_semantic_clustering:
-# #                     clusters = ProductAttributeService.calculate_value_clusters(
-# #                         allowed_values, scores, cluster_threshold=0.4
-# #                     )
-# #                     best_cluster = next((c for c in clusters if best_val in c), [best_val])
-
-# #                 for val, sc in scores[1:]:
-# #                     # Skip values with very low scores
-# #                     min_score = 0.4 if is_dimension_attr else 0.3
-# #                     if sc < min_score:
-# #                         continue
-                    
-# #                     # Calculate dynamic threshold for this value
-# #                     if use_dynamic_thresholds and extracted_attrs:
-# #                         dynamic_thresh = ProductAttributeService.get_dynamic_threshold(
-# #                             attr, val, sc, extracted_attrs, relationships,
-# #                             mandatory_attrs, threshold_abs
-# #                         )
-# #                     else:
-# #                         dynamic_thresh = threshold_abs
-
-# #                     within_margin = (best_score - sc) <= effective_margin
-# #                     above_threshold = sc >= dynamic_thresh
-
-# #                     # Check if in same semantic cluster as best value
-# #                     in_cluster = False
-# #                     if use_semantic_clustering and clusters:
-# #                         in_cluster = any(best_val in c and val in c for c in clusters)
-
-# #                     if use_base_threshold:
-# #                         # Best score is good, require threshold OR (cluster + margin)
-# #                         if above_threshold and within_margin:
-# #                             candidates.append(val)
-# #                         elif in_cluster and within_margin:
-# #                             candidates.append(val)
-# #                     else:
-# #                         # Best score is low, use margin OR cluster logic
-# #                         if within_margin:
-# #                             candidates.append(val)
-# #                         elif in_cluster and (best_score - sc) <= effective_margin * 2.0:
-# #                             # Extended margin for cluster members
-# #                             candidates.append(val)
-
-# #                 # Map each candidate to its source and create array format
-# #                 extracted[attr] = []
-# #                 for candidate in candidates:
-# #                     source = ProductAttributeService.find_value_source(candidate, source_map, attr)
-# #                     extracted[attr].append({"value": candidate, "source": source})
-
-# #         return extracted
-
-
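Outside the dimension fast path, every allowed value is scored with the same 0.8 / 0.2 blend of semantic and lexical evidence (numbers illustrative):

sem_sim = 0.62     # best cos_sim over the contextual phrasings of one value
lex_score = 0.5    # token-overlap ratio from _lexical_evidence
final_score = 0.8 * sem_sim + 0.2 * lex_score
print(final_score)  # ~0.596, then compared against the (possibly dynamic) threshold
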
-# #     @staticmethod
-# #     def extract_attributes(
-# #         product_text: str,
-# #         mandatory_attrs: Dict[str, List[str]],
-# #         source_map: Dict[str, str] = None,
-# #         model: str = None,
-# #         extract_additional: bool = True,
-# #         multiple: Optional[List[str]] = None,
-# #         threshold_abs: float = 0.65,
-# #         margin: float = 0.15,
-# #         use_dynamic_thresholds: bool = True,
-# #         use_adaptive_margin: bool = True,
-# #         use_semantic_clustering: bool = True
-# #     ) -> dict:
-# #         """
-# #         Use Groq LLM to extract attributes from any product type with enhanced multi-value selection.
-# #         Now returns values in array format: [{"value": "...", "source": "..."}]
-# #         """
-        
-# #         if model is None:
-# #             model = settings.SUPPORTED_MODELS[0]
-
-# #         if multiple is None:
-# #             multiple = []
-
-# #         if source_map is None:
-# #             source_map = {}
-
-# #         # Check if product text is empty or minimal
-# #         if not product_text or product_text == "No product information available":
-# #             return ProductAttributeService._create_error_response(
-# #                 "No product information provided",
-# #                 mandatory_attrs,
-# #                 extract_additional
-# #             )
-
-# #         # Create structured prompt for mandatory attributes
-# #         mandatory_attr_list = []
-# #         for attr_name, allowed_values in mandatory_attrs.items():
-# #             mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
-# #         mandatory_attr_text = "\n".join(mandatory_attr_list)
-
-# #         additional_instruction = ""
-# #         if extract_additional:
-# #             additional_instruction = """
-# # 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text 
-# #    that are NOT in the mandatory list. Only include attributes where you can find actual values
-# #    in the product text. Do NOT include attributes with "Not Specified" or empty values.
-   
-# #    Examples of attributes to look for (only if present): Brand, Material, Size, Color, Dimensions,
-# #    Weight, Features, Style, Theme, Pattern, Finish, Care Instructions, etc."""
-
-# #         output_format = {
-# #             "mandatory": {attr: "value or list of values" for attr in mandatory_attrs.keys()},
-# #         }
-
-# #         if extract_additional:
-# #             output_format["additional"] = {
-# #                 "example_attribute_1": "actual value found",
-# #                 "example_attribute_2": "actual value found"
-# #             }
-# #             output_format["additional"]["_note"] = "Only include attributes with actual values found in text"
-
-# #         prompt = f"""
-# # You are an intelligent product attribute extractor that works with ANY product type.
-
-# # TASK:
-# # 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value(s)
-# #    from the provided list. Choose the value(s) that best match the product description.
-# # {additional_instruction}
-
-# # Product Text:
-# # {product_text}
-
-# # Mandatory Attribute Lists (MUST select from these allowed values):
-# # {mandatory_attr_text}
-
-# # CRITICAL INSTRUCTIONS:
-# # - Return ONLY valid JSON, nothing else
-# # - No explanations, no markdown, no text before or after the JSON
-# # - For mandatory attributes, choose the value(s) from the provided list that best match
-# # - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
-# # - Prefer exact matches from the allowed values list over generic synonyms
-# # - If multiple values are plausible, you MAY return more than one
-# # {f"- For additional attributes: ONLY include attributes where you found actual values in the product text. DO NOT include attributes with 'Not Specified', 'None', 'N/A', or empty values. If you cannot find a value for an attribute, simply don't include that attribute." if extract_additional else ""}
-# # - Be precise and only extract information that is explicitly stated or clearly implied
-
-# # Required Output Format:
-# # {json.dumps(output_format, indent=2)}
-# #         """
-
-# #         payload = {
-# #             "model": model,
-# #             "messages": [
-# #                 {
-# #                     "role": "system",
-# #                     "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
-# #                 },
-# #                 {"role": "user", "content": prompt}
-# #             ],
-# #             "temperature": 0.0,
-# #             "max_tokens": 1500
-# #         }
-
-# #         headers = {
-# #             "Authorization": f"Bearer {settings.GROQ_API_KEY}",
-# #             "Content-Type": "application/json",
-# #         }
-
-# #         try:
-# #             response = requests.post(
-# #                 settings.GROQ_API_URL,
-# #                 headers=headers,
-# #                 json=payload,
-# #                 timeout=30
-# #             )
-# #             response.raise_for_status()
-# #             result_text = response.json()["choices"][0]["message"]["content"].strip()
-
-# #             # Clean the response
-# #             result_text = ProductAttributeService._clean_json_response(result_text)
-
-# #             # Parse JSON
-# #             parsed = json.loads(result_text)
-
-# #             # Validate and restructure with source tracking
-# #             parsed = ProductAttributeService._validate_response_structure(
-# #                 parsed, mandatory_attrs, extract_additional, source_map
-# #             )
-
-# #             # Clean up and add source tracking to additional attributes in array format
-# #             if extract_additional and "additional" in parsed:
-# #                 cleaned_additional = {}
-# #                 for k, v in parsed["additional"].items():
-# #                     if v and v not in ["Not Specified", "None", "N/A", "", "not specified", "none", "n/a"]:
-# #                         if not (isinstance(v, str) and v.lower() in ["not specified", "none", "n/a", ""]):
-# #                             # Convert to array format if not already
-# #                             if isinstance(v, list):
-# #                                 cleaned_additional[k] = []
-# #                                 for item in v:
-# #                                     if isinstance(item, dict) and "value" in item:
-# #                                         if "source" not in item:
-# #                                             item["source"] = ProductAttributeService.find_value_source(
-# #                                                 item["value"], source_map, k
-# #                                             )
-# #                                         cleaned_additional[k].append(item)
-# #                                     else:
-# #                                         source = ProductAttributeService.find_value_source(str(item), source_map, k)
-# #                                         cleaned_additional[k].append({"value": str(item), "source": source})
-# #                             else:
-# #                                 source = ProductAttributeService.find_value_source(str(v), source_map, k)
-# #                                 cleaned_additional[k] = [{"value": str(v), "source": source}]
-# #                 parsed["additional"] = cleaned_additional
-
-# #             # Calculate attribute relationships if using dynamic thresholds
-# #             relationships = {}
-# #             if use_dynamic_thresholds:
-# #                 relationships = ProductAttributeService.calculate_attribute_relationships(
-# #                     mandatory_attrs, product_text
-# #                 )
-
-# #             # Process attributes in order, allowing earlier ones to influence later ones
-# #             extracted_so_far = {}
-# #             for attr in mandatory_attrs.keys():
-# #                 allow_multiple = attr in multiple
-
-# #                 result = ProductAttributeService.normalize_against_product_text(
-# #                     product_text=product_text,
-# #                     mandatory_attrs={attr: mandatory_attrs[attr]},
-# #                     source_map=source_map,
-# #                     threshold_abs=threshold_abs,
-# #                     margin=margin,
-# #                     allow_multiple=allow_multiple,
-# #                     extracted_attrs=extracted_so_far,
-# #                     relationships=relationships,
-# #                     use_dynamic_thresholds=use_dynamic_thresholds,
-# #                     use_adaptive_margin=use_adaptive_margin,
-# #                     use_semantic_clustering=use_semantic_clustering
-# #                 )
-
-# #                 # Result is already in array format from normalize_against_product_text
-# #                 parsed["mandatory"][attr] = result[attr]
-# #                 extracted_so_far[attr] = result[attr]
-
-# #             return parsed
-
-# #         except requests.exceptions.RequestException as e:
-# #             return ProductAttributeService._create_error_response(
-# #                 str(e), mandatory_attrs, extract_additional
-# #             )
-# #         except json.JSONDecodeError as e:
-# #             return ProductAttributeService._create_error_response(
-# #                 f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
-# #             )
-# #         except Exception as e:
-# #             return ProductAttributeService._create_error_response(
-# #                 str(e), mandatory_attrs, extract_additional
-# #             )
-
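The shape of a successful result, per the validation and normalization steps above (values illustrative):

result = {
    "mandatory": {
        "Room": [{"value": "Living Room", "source": "short_desc"}],
    },
    "additional": {
        "Material": [{"value": "Canvas", "source": "title"}],
    },
}
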
-# #     @staticmethod
-# #     def extract_attributes_batch(
-# #         products: List[Dict],
-# #         mandatory_attrs: Dict[str, List[str]],
-# #         model: str = None,
-# #         extract_additional: bool = True,
-# #         process_image: bool = True,
-# #         max_workers: int = 5,
-# #         multiple: Optional[List[str]] = None,
-# #         threshold_abs: float = 0.65,
-# #         margin: float = 0.15,
-# #         use_dynamic_thresholds: bool = True,
-# #         use_adaptive_margin: bool = True,
-# #         use_semantic_clustering: bool = True
-# #     ) -> Dict:
-# #         """Extract attributes for multiple products in parallel with enhanced multi-value selection and source tracking."""
-# #         results = []
-# #         successful = 0
-# #         failed = 0
-        
-# #         ocr_service = OCRService()
-
-# #         if multiple is None:
-# #             multiple = []
-
-# #         def process_product(product_data):
-# #             """Process a single product."""
-# #             product_id = product_data.get('product_id', f"product_{len(results)}")
-            
-# #             try:
-# #                 # Process image if URL is provided
-# #                 ocr_results = None
-# #                 ocr_text = None
-                
-# #                 if process_image and product_data.get('image_url'):
-# #                     ocr_results = ocr_service.process_image(product_data['image_url'])
-                    
-# #                     # Extract attributes from OCR
-# #                     if ocr_results and ocr_results.get('detected_text'):
-# #                         ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
-# #                             ocr_results, model
-# #                         )
-# #                         ocr_results['extracted_attributes'] = ocr_attrs
-                        
-# #                         # Format OCR text for combining with product text
-# #                         ocr_text = "\n".join([
-# #                             f"{item['text']} (confidence: {item['confidence']:.2f})"
-# #                             for item in ocr_results['detected_text']
-# #                         ])
-                
-# #                 # Combine all product information with source tracking
-# #                 product_text, source_map = ProductAttributeService.combine_product_text(
-# #                     title=product_data.get('title'),
-# #                     short_desc=product_data.get('short_desc'),
-# #                     long_desc=product_data.get('long_desc'),
-# #                     ocr_text=ocr_text
-# #                 )
-                
-# #                 # Extract attributes from combined text with enhanced features
-# #                 result = ProductAttributeService.extract_attributes(
-# #                     product_text=product_text,
-# #                     mandatory_attrs=mandatory_attrs,
-# #                     source_map=source_map,
-# #                     model=model,
-# #                     extract_additional=extract_additional,
-# #                     multiple=multiple,
-# #                     threshold_abs=threshold_abs,
-# #                     margin=margin,
-# #                     use_dynamic_thresholds=use_dynamic_thresholds,
-# #                     use_adaptive_margin=use_adaptive_margin,
-# #                     use_semantic_clustering=use_semantic_clustering
-# #                 )
-                
-# #                 result['product_id'] = product_id
-                
-# #                 # Add OCR results if available (already in correct format)
-# #                 if ocr_results:
-# #                     result['ocr_results'] = ocr_results
-                
-# #                 # Check if extraction was successful
-# #                 if 'error' not in result:
-# #                     return result, True
-# #                 else:
-# #                     return result, False
-                    
-# #             except Exception as e:
-# #                 return {
-# #                     'product_id': product_id,
-# #                     'mandatory': {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
-# #                     'additional': {} if extract_additional else None,
-# #                     'error': f"Processing error: {str(e)}"
-# #                 }, False
-
-# #         # Process products in parallel
-# #         with ThreadPoolExecutor(max_workers=max_workers) as executor:
-# #             future_to_product = {
-# #                 executor.submit(process_product, product): product 
-# #                 for product in products
-# #             }
-            
-# #             for future in as_completed(future_to_product):
-# #                 try:
-# #                     result, success = future.result()
-# #                     results.append(result)
-# #                     if success:
-# #                         successful += 1
-# #                     else:
-# #                         failed += 1
-# #                 except Exception as e:
-# #                     failed += 1
-# #                     results.append({
-# #                         'product_id': 'unknown',
-# #                         'mandatory': {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
-# #                         'additional': {} if extract_additional else None,
-# #                         'error': f"Unexpected error: {str(e)}"
-# #                     })
-
-# #         return {
-# #             'results': results,
-# #             'total_products': len(products),
-# #             'successful': successful,
-# #             'failed': failed
-# #         }
-
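The batch path is a plain ThreadPoolExecutor fan-out; a self-contained skeleton of the same pattern:

from concurrent.futures import ThreadPoolExecutor, as_completed

def process(item):
    # Stand-in for process_product above; returns (result, success_flag).
    return {"product_id": item}, True

items = ["p1", "p2", "p3"]
results, successful, failed = [], 0, 0
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = {executor.submit(process, item): item for item in items}
    for future in as_completed(futures):
        result, ok = future.result()
        results.append(result)
        successful += ok
        failed += not ok
print(successful, failed)   # 3 0
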
-# #     @staticmethod
-# #     def _clean_json_response(text: str) -> str:
-# #         """Clean LLM response to extract valid JSON."""
-# #         start_idx = text.find('{')
-# #         end_idx = text.rfind('}')
-
-# #         if start_idx != -1 and end_idx != -1:
-# #             text = text[start_idx:end_idx + 1]
-
-# #         if "```json" in text:
-# #             text = text.split("```json")[1].split("```")[0].strip()
-# #         elif "```" in text:
-# #             text = text.split("```")[1].split("```")[0].strip()
-# #             if text.startswith("json"):
-# #                 text = text[4:].strip()
-
-# #         return text
-
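Example: a fenced LLM reply reduced to bare JSON. The brace slice alone already removes the fences and any preamble, which is why the fence-stripping branch after it rarely fires.

raw = 'Here you go:\n```json\n{"mandatory": {"Room": "Living Room"}}\n```'
cleaned = raw[raw.find('{'):raw.rfind('}') + 1]
print(cleaned)   # {"mandatory": {"Room": "Living Room"}}
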
-# #     @staticmethod
-# #     def _validate_response_structure(
-# #         parsed: dict,
-# #         mandatory_attrs: Dict[str, List[str]],
-# #         extract_additional: bool,
-# #         source_map: Dict[str, str] = None
-# #     ) -> dict:
-# #         """Validate and fix the response structure, ensuring array format with source tracking."""
-# #         if source_map is None:
-# #             source_map = {}
-        
-# #         expected_sections = ["mandatory"]
-# #         if extract_additional:
-# #             expected_sections.append("additional")
-
-# #         if not all(section in parsed for section in expected_sections):
-# #             if isinstance(parsed, dict):
-# #                 mandatory_keys = set(mandatory_attrs.keys())
-# #                 mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
-# #                 additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
-
-# #                 result = {"mandatory": mandatory}
-# #                 if extract_additional:
-# #                     result["additional"] = additional
-# #                 parsed = result
-# #             else:
-# #                 return ProductAttributeService._create_error_response(
-# #                     "Invalid response structure",
-# #                     mandatory_attrs,
-# #                     extract_additional,
-# #                     str(parsed)
-# #                 )
-
-# #         # Convert mandatory attributes to array format with source tracking
-# #         if "mandatory" in parsed:
-# #             converted_mandatory = {}
-# #             for attr, value in parsed["mandatory"].items():
-# #                 if isinstance(value, list):
-# #                     # Already in array format, ensure each item has source
-# #                     converted_mandatory[attr] = []
-# #                     for item in value:
-# #                         if isinstance(item, dict) and "value" in item:
-# #                             # Already has proper structure
-# #                             if "source" not in item:
-# #                                 item["source"] = ProductAttributeService.find_value_source(
-# #                                     item["value"], source_map, attr
-# #                                 )
-# #                             converted_mandatory[attr].append(item)
-# #                         else:
-# #                             # Convert string to proper format
-# #                             source = ProductAttributeService.find_value_source(str(item), source_map, attr)
-# #                             converted_mandatory[attr].append({"value": str(item), "source": source})
-# #                 else:
-# #                     # Single value - convert to array format
-# #                     source = ProductAttributeService.find_value_source(str(value), source_map, attr)
-# #                     converted_mandatory[attr] = [{"value": str(value), "source": source}]
-            
-# #             parsed["mandatory"] = converted_mandatory
-
-# #         return parsed
-
-# #     @staticmethod
-# #     def _create_error_response(
-# #         error: str,
-# #         mandatory_attrs: Dict[str, List[str]],
-# #         extract_additional: bool,
-# #         raw_output: Optional[str] = None
-# #     ) -> dict:
-# #         """Create a standardized error response in array format."""
-# #         response = {
-# #             "mandatory": {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
-# #             "error": error
-# #         }
-# #         if extract_additional:
-# #             response["additional"] = {}
-# #         if raw_output:
-# #             response["raw_output"] = raw_output
-# #         return response
-
-
-
-
-
-
-
-
-
-
-# # ==================== services.py (OPTIMIZED) ====================
-# import requests
-# import json
-# import re
-# import hashlib
-# import logging
-# from typing import Dict, List, Optional, Tuple
-# from django.conf import settings
-# from concurrent.futures import ThreadPoolExecutor, as_completed
-# from sentence_transformers import SentenceTransformer, util
-# import numpy as np
-# from .ocr_service import OCRService
-
-# logger = logging.getLogger(__name__)
-
-# # Initialize embedding model for normalization (SINGLETON)
-# model_embedder = SentenceTransformer("all-MiniLM-L6-v2")
-
-
-# # ==================== CACHING CLASSES ====================
-
-# class SimpleCache:
-#     """
-#     In-memory cache for attribute extraction results.
-#     No Redis required - uses Python dict with automatic size management.
-#     """
-#     _cache = {}
-#     _max_size = 1000  # Maximum number of cached items
-    
-#     @classmethod
-#     def get(cls, key: str) -> Optional[Dict]:
-#         """Get cached value by key"""
-#         return cls._cache.get(key)
-    
-#     @classmethod
-#     def set(cls, key: str, value: Dict):
-#         """Set cache value with automatic LRU cleanup"""
-#         # Simple FIFO-style eviction: drop the oldest 20% when the cache is full
-#         if len(cls._cache) >= cls._max_size:
-#             items = list(cls._cache.items())
-#             # Keep newest 80%
-#             cls._cache = dict(items[int(cls._max_size * 0.2):])
-#             logger.info(f"Cache cleaned: kept {len(cls._cache)} items")
-        
-#         cls._cache[key] = value
-    
-#     @classmethod
-#     def clear(cls):
-#         """Clear entire cache"""
-#         cls._cache.clear()
-#         logger.info("Cache cleared")
-    
-#     @classmethod
-#     def get_stats(cls) -> Dict:
-#         """Get cache statistics"""
-#         return {
-#             "size": len(cls._cache),
-#             "max_size": cls._max_size,
-#             "usage_percent": round(len(cls._cache) / cls._max_size * 100, 2)
-#         }
-
-
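The read-through pattern SimpleCache is built for, as a self-contained sketch:

_cache = {}   # stand-in for SimpleCache._cache

def get_or_compute(key, compute):
    # Look up first, compute on a miss, store for the next caller.
    if key not in _cache:
        _cache[key] = compute()
    return _cache[key]

first = get_or_compute("attr_abc", lambda: {"mandatory": {}})   # computed
second = get_or_compute("attr_abc", lambda: {"mandatory": {}})  # cache hit
assert first is second
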
-# class EmbeddingCache:
-#     """
-#     Cache for sentence transformer embeddings.
-#     Significantly reduces embedding computation time.
-#     """
-#     _cache = {}
-#     _max_size = 500
-#     _hit_count = 0
-#     _miss_count = 0
-    
-#     @classmethod
-#     def get_embedding(cls, text: str, model):
-#         """Get or compute embedding with caching"""
-#         if text in cls._cache:
-#             cls._hit_count += 1
-#             return cls._cache[text]
-        
-#         # Cache miss - compute embedding
-#         cls._miss_count += 1
-        
-#         # Auto-cleanup if cache is full
-#         if len(cls._cache) >= cls._max_size:
-#             items = list(cls._cache.items())
-#             cls._cache = dict(items[int(cls._max_size * 0.3):])
-#             logger.info(f"Embedding cache cleaned: kept {len(cls._cache)} items")
-        
-#         # Compute and cache
-#         embedding = model.encode(text, convert_to_tensor=True)
-#         cls._cache[text] = embedding
-#         return embedding
-    
-#     @classmethod
-#     def clear(cls):
-#         """Clear embedding cache"""
-#         cls._cache.clear()
-#         cls._hit_count = 0
-#         cls._miss_count = 0
-#         logger.info("Embedding cache cleared")
-    
-#     @classmethod
-#     def get_stats(cls) -> Dict:
-#         """Get cache statistics"""
-#         total = cls._hit_count + cls._miss_count
-#         hit_rate = (cls._hit_count / total * 100) if total > 0 else 0
-#         return {
-#             "size": len(cls._cache),
-#             "max_size": cls._max_size,
-#             "hits": cls._hit_count,
-#             "misses": cls._miss_count,
-#             "hit_rate_percent": round(hit_rate, 2)
-#         }
-
-
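The embedding cache amounts to memoising model.encode by input string; the core, minus eviction and hit counters (requires sentence-transformers, as imported above):

from sentence_transformers import SentenceTransformer

_model = SentenceTransformer("all-MiniLM-L6-v2")
_embeddings = {}

def get_embedding(text: str):
    # Encode each distinct string once; return the cached tensor afterwards.
    if text not in _embeddings:
        _embeddings[text] = _model.encode(text, convert_to_tensor=True)
    return _embeddings[text]
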
-# # ==================== MAIN SERVICE CLASS ====================
-
-# class ProductAttributeService:
-#     """Service class for extracting product attributes using Groq LLM."""
-
-#     @staticmethod
-#     def _generate_cache_key(product_text: str, mandatory_attrs: Dict) -> str:
-#         """
-#         Generate a unique cache key from product text and attributes.
-#         Uses MD5 hash for consistent short keys.
-#         """
-#         # Sort attributes for consistent hashing
-#         attrs_str = json.dumps(mandatory_attrs, sort_keys=True)
-#         content = f"{product_text}:{attrs_str}"
-#         return f"attr_{hashlib.md5(content.encode()).hexdigest()}"
-
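Key stability relies on sort_keys=True: the same attribute dict hashes to the same key regardless of insertion order.

import hashlib
import json

def cache_key(product_text, mandatory_attrs):
    # Same scheme as _generate_cache_key above.
    attrs_str = json.dumps(mandatory_attrs, sort_keys=True)
    return f"attr_{hashlib.md5(f'{product_text}:{attrs_str}'.encode()).hexdigest()}"

a = cache_key("canvas 16x20", {"Room": ["Living Room"], "Style": ["Modern"]})
b = cache_key("canvas 16x20", {"Style": ["Modern"], "Room": ["Living Room"]})
assert a == b   # insertion order does not change the key
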
-#     @staticmethod
-#     def normalize_dimension_text(text: str) -> str:
-#         """
-#         Normalize dimension text to match format like '16x20', '20x30', etc.
-#         Handles formats like '16 x 20', '16x1.5x20', '16 x 1.5 x 20 Inches'
-#         Returns the normalized dimension (e.g., '16x20') or empty string if not found.
-#         """
-#         if not text:
-#             return ""
-        
-#         # Convert to lowercase and remove common units
-#         text = text.lower()
-#         text = re.sub(r'\s*(inches|inch|in|cm|centimeters|mm|millimeters)\s*', '', text, flags=re.IGNORECASE)
-        
-#         # Extract all numbers from the text
-#         numbers = re.findall(r'\d+\.?\d*', text)
-        
-#         if not numbers:
-#             return ""
-        
-#         # Convert to floats first to handle decimals properly
-#         float_numbers = []
-#         for num in numbers:
-#             try:
-#                 float_numbers.append(float(num))
-#             except ValueError:
-#                 continue
-        
-#         if len(float_numbers) < 2:
-#             return ""
-        
-#         # If we have 3 dimensions, it's likely Width x Depth x Height
-#         # For wall art, depth is usually small (< 5), so we keep first and last
-#         if len(float_numbers) == 3:
-#             # Keep first and last values (width and height), skip middle (depth)
-#             float_numbers = [float_numbers[0], float_numbers[2]]
-#         elif len(float_numbers) > 3:
-#             # If more than 3 dimensions, keep the two largest
-#             float_numbers = sorted(float_numbers)[-2:]
-#         else:
-#             # Just 2 dimensions, use as is
-#             float_numbers = float_numbers[:2]
-        
-#         # Format numbers: use integer if whole, else one decimal
-#         formatted_numbers = []
-#         for num in float_numbers:
-#             if num.is_integer():
-#                 formatted_numbers.append(str(int(num)))
-#             else:
-#                 formatted_numbers.append(f"{num:.1f}")
-        
-#         # Sort to ensure consistent order (smaller x larger)
-#         formatted_numbers.sort(key=lambda x: float(x))
-        
-#         # Return formatted dimension
-#         return f"{formatted_numbers[0]}x{formatted_numbers[1]}"
-    
-#     @staticmethod
-#     def normalize_value_for_matching(value: str, attr_name: str = "") -> str:
-#         """
-#         Normalize a value based on its attribute type for better matching.
-#         Currently handles dimensions specially, can be extended for other attributes.
-#         """
-#         # Check if this is a dimension-related attribute
-#         dimension_keywords = ['dimension', 'size', 'measurement']
-#         if any(keyword in attr_name.lower() for keyword in dimension_keywords):
-#             normalized = ProductAttributeService.normalize_dimension_text(value)
-#             if normalized:
-#                 return normalized
-        
-#         # For other attributes, just return cleaned value
-#         return value.strip()
-
-#     @staticmethod
-#     def combine_product_text(
-#         title: Optional[str] = None,
-#         short_desc: Optional[str] = None,
-#         long_desc: Optional[str] = None,
-#         ocr_text: Optional[str] = None
-#     ) -> Tuple[str, Dict[str, str]]:
-#         """
-#         Combine product metadata into a single text block.
-#         Returns: (combined_text, source_map) where source_map tracks which text came from where
-#         """
-#         parts = []
-#         source_map = {}
-        
-#         if title:
-#             title_str = str(title).strip()
-#             parts.append(f"Title: {title_str}")
-#             source_map['title'] = title_str
-#         if short_desc:
-#             short_str = str(short_desc).strip()
-#             parts.append(f"Description: {short_str}")
-#             source_map['short_desc'] = short_str
-#         if long_desc:
-#             long_str = str(long_desc).strip()
-#             parts.append(f"Details: {long_str}")
-#             source_map['long_desc'] = long_str
-#         if ocr_text:
-#             parts.append(f"OCR Text: {ocr_text}")
-#             source_map['ocr_text'] = ocr_text
-        
-#         combined = "\n".join(parts).strip()
-        
-#         if not combined:
-#             return "No product information available", {}
-        
-#         return combined, source_map
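A minimal usage sketch (field values invented; assumes the class above is importable):

text, sources = ProductAttributeService.combine_product_text(
    title="Abstract Canvas Wall Art",
    short_desc="Framed print, 16 x 20 Inches",
)
# text    == "Title: Abstract Canvas Wall Art\nDescription: Framed print, 16 x 20 Inches"
# sources == {"title": "Abstract Canvas Wall Art",
#             "short_desc": "Framed print, 16 x 20 Inches"}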
-
-#     @staticmethod
-#     def find_value_source(value: str, source_map: Dict[str, str], attr_name: str = "") -> str:
-#         """
-#         Find which source(s) contain the given value.
-#         Returns the source name(s) where the value appears.
-#         Now handles normalized matching for dimensions.
-#         """
-#         value_lower = value.lower()
-#         # Split value into tokens for better matching
-#         # Split "x" only between digits so words like "box" keep their letters
-#         value_tokens = set(re.sub(r"(?<=\d)x(?=\d)", " ", value_lower.replace("-", " ")).split())
-        
-#         # Check if this is a dimension-related attribute
-#         is_dimension_attr = any(keyword in attr_name.lower() for keyword in ['dimension', 'size', 'measurement'])
-        
-#         sources_found = []
-#         source_scores = {}
-        
-#         for source_name, source_text in source_map.items():
-#             source_lower = source_text.lower()
-            
-#             # Check for exact phrase match first
-#             if value_lower in source_lower:
-#                 source_scores[source_name] = 1.0
-#                 continue
-            
-#             # For dimensions, check normalized match
-#             if is_dimension_attr:
-#                 # Normalize the value (e.g., "16x20" stays "16x20")
-#                 normalized_value = ProductAttributeService.normalize_dimension_text(value)
-#                 if not normalized_value:
-#                     normalized_value = value.replace("x", " ").strip()
-                
-#                 # Normalize the source text to extract dimensions
-#                 normalized_source = ProductAttributeService.normalize_dimension_text(source_text)
-                
-#                 # Direct match
-#                 if normalized_value == normalized_source:
-#                     source_scores[source_name] = 0.95
-#                     continue
-                
-#                 # Also check if the dimension numbers appear in the source
-#                 # Extract dimension parts (e.g., "16x20" -> ["16", "20"])
-#                 dim_parts = normalized_value.split("x") if "x" in normalized_value else []
-#                 if len(dim_parts) == 2:
-#                     # Check if both numbers appear in the source
-#                     if all(part in source_text for part in dim_parts):
-#                         source_scores[source_name] = 0.85
-#                         continue
-            
-#             # Check for token matches
-#             token_matches = sum(1 for token in value_tokens if token and token in source_lower)
-#             if token_matches > 0 and len(value_tokens) > 0:
-#                 source_scores[source_name] = token_matches / len(value_tokens)
-        
-#         # Return source with highest score, or all sources if multiple have same score
-#         if source_scores:
-#             max_score = max(source_scores.values())
-#             sources_found = [s for s, score in source_scores.items() if score == max_score]
-            
-#             # Prioritize: title > short_desc > long_desc > ocr_text
-#             priority = ['title', 'short_desc', 'long_desc', 'ocr_text']
-#             for p in priority:
-#                 if p in sources_found:
-#                     return p
-            
-#             return sources_found[0] if sources_found else "Not found"
-        
-#         return "Not found"
-
-#     @staticmethod
-#     def format_visual_attributes(visual_attributes: Dict) -> Dict:
-#         """
-#         Convert visual attributes to array format with source tracking.
-#         Source is always 'image' for visual attributes.
-#         """
-#         formatted = {}
-        
-#         for key, value in visual_attributes.items():
-#             if isinstance(value, list):
-#                 # Already a list (like color_palette)
-#                 formatted[key] = [{"value": str(item), "source": "image"} for item in value]
-#             elif isinstance(value, dict):
-#                 # Nested dictionary - format recursively
-#                 nested_formatted = {}
-#                 for nested_key, nested_value in value.items():
-#                     if isinstance(nested_value, list):
-#                         nested_formatted[nested_key] = [{"value": str(item), "source": "image"} for item in nested_value]
-#                     else:
-#                         nested_formatted[nested_key] = [{"value": str(nested_value), "source": "image"}]
-#                 formatted[key] = nested_formatted
-#             else:
-#                 # Single value
-#                 formatted[key] = [{"value": str(value), "source": "image"}]
-        
-#         return formatted
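Shape of the transformation, with invented keys and values:

visual = {"dominant_color": "navy", "color_palette": ["navy", "gold"]}
# format_visual_attributes(visual) ->
# {"dominant_color": [{"value": "navy", "source": "image"}],
#  "color_palette":  [{"value": "navy", "source": "image"},
#                     {"value": "gold", "source": "image"}]}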
-
-#     @staticmethod
-#     def extract_attributes_from_ocr(ocr_results: Dict, model: str = None) -> Dict:
-#         """Extract structured attributes from OCR text using LLM."""
-#         if model is None:
-#             model = settings.SUPPORTED_MODELS[0]
-        
-#         detected_text = ocr_results.get('detected_text', [])
-#         if not detected_text:
-#             return {}
-        
-#         # Format OCR text for prompt
-#         ocr_text = "\n".join([f"Text: {item['text']}, Confidence: {item['confidence']:.2f}" 
-#                               for item in detected_text])
-        
-#         prompt = f"""
-# You are an AI model that extracts structured attributes from OCR text detected on product images.
-# Given the OCR detections below, infer the possible product attributes and return them as a clean JSON object.
-
-# OCR Text:
-# {ocr_text}
-
-# Extract relevant attributes like:
-# - brand
-# - model_number
-# - size (waist_size, length, etc.)
-# - collection
-# - any other relevant product information
-
-# Return a JSON object with only the attributes you can confidently identify.
-# If an attribute is not present, do not include it in the response.
-# """
-        
-#         payload = {
-#             "model": model,
-#             "messages": [
-#                 {
-#                     "role": "system",
-#                     "content": "You are a helpful AI that extracts structured data from OCR output. Return only valid JSON."
-#                 },
-#                 {"role": "user", "content": prompt}
-#             ],
-#             "temperature": 0.2,
-#             "max_tokens": 500
-#         }
-        
-#         headers = {
-#             "Authorization": f"Bearer {settings.GROQ_API_KEY}",
-#             "Content-Type": "application/json",
-#         }
-        
-#         try:
-#             response = requests.post(
-#                 settings.GROQ_API_URL,
-#                 headers=headers,
-#                 json=payload,
-#                 timeout=30
-#             )
-#             response.raise_for_status()
-#             result_text = response.json()["choices"][0]["message"]["content"].strip()
-            
-#             # Clean and parse JSON
-#             result_text = ProductAttributeService._clean_json_response(result_text)
-#             parsed = json.loads(result_text)
-            
-#             # Convert to array format with source tracking
-#             formatted_attributes = {}
-#             for key, value in parsed.items():
-#                 if key == "error":
-#                     continue
-                
-#                 # Handle nested dictionaries (like size)
-#                 if isinstance(value, dict):
-#                     nested_formatted = {}
-#                     for nested_key, nested_value in value.items():
-#                         nested_formatted[nested_key] = [{"value": str(nested_value), "source": "image"}]
-#                     formatted_attributes[key] = nested_formatted
-#                 elif isinstance(value, list):
-#                     # Already a list, convert each item
-#                     formatted_attributes[key] = [{"value": str(item), "source": "image"} for item in value]
-#                 else:
-#                     # Single value
-#                     formatted_attributes[key] = [{"value": str(value), "source": "image"}]
-            
-#             return formatted_attributes
-#         except Exception as e:
-#             logger.error(f"OCR attribute extraction failed: {str(e)}")
-#             return {"error": f"Failed to extract attributes from OCR: {str(e)}"}
-
-#     @staticmethod
-#     def calculate_attribute_relationships(
-#         mandatory_attrs: Dict[str, List[str]],
-#         product_text: str
-#     ) -> Dict[str, float]:
-#         """
-#         Calculate semantic relationships between attribute values across different attributes.
-#         Returns a matrix of cross-attribute value similarities.
-#         """
-#         # USE EMBEDDING CACHE
-#         pt_emb = EmbeddingCache.get_embedding(product_text, model_embedder)
-
-#         # Calculate similarities between all attribute values and product text
-#         attr_scores = {}
-#         for attr, values in mandatory_attrs.items():
-#             attr_scores[attr] = {}
-#             for val in values:
-#                 contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}"]
-#                 # USE EMBEDDING CACHE FOR CONTEXTS
-#                 ctx_embs = [EmbeddingCache.get_embedding(c, model_embedder) for c in contexts]
-#                 sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
-#                 attr_scores[attr][val] = sem_sim
-
-#         # Calculate cross-attribute value relationships
-#         relationships = {}
-#         attr_list = list(mandatory_attrs.keys())
-
-#         for i, attr1 in enumerate(attr_list):
-#             for attr2 in attr_list[i+1:]:
-#                 # Calculate pairwise similarities between values of different attributes
-#                 for val1 in mandatory_attrs[attr1]:
-#                     for val2 in mandatory_attrs[attr2]:
-#                         # USE EMBEDDING CACHE
-#                         emb1 = EmbeddingCache.get_embedding(val1, model_embedder)
-#                         emb2 = EmbeddingCache.get_embedding(val2, model_embedder)
-#                         sim = float(util.cos_sim(emb1, emb2).item())
-
-#                         # Store bidirectional relationships
-#                         key1 = f"{attr1}:{val1}->{attr2}:{val2}"
-#                         key2 = f"{attr2}:{val2}->{attr1}:{val1}"
-#                         relationships[key1] = sim
-#                         relationships[key2] = sim
-
-#         return relationships
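Note that the nested loops embed and compare every value pair across every attribute pair, so cost grows quadratically with the size of the value lists. The key layout (attribute and value names invented):

key_fwd = "room_type:Bedroom->style:Modern"   # f"{attr1}:{val1}->{attr2}:{val2}"
key_rev = "style:Modern->room_type:Bedroom"
# relationships[key_fwd] == relationships[key_rev]; both hold the cosine
# similarity between the two value embeddings.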
-
-#     @staticmethod
-#     def calculate_value_clusters(
-#         values: List[str],
-#         scores: List[Tuple[str, float]],
-#         cluster_threshold: float = 0.4
-#     ) -> List[List[str]]:
-#         """
-#         Group values into semantic clusters based on their similarity to each other.
-#         Returns clusters of related values.
-#         """
-#         if len(values) <= 1:
-#             return [[val] for val, _ in scores]
-
-#         # Get embeddings for all values - USE CACHE
-#         embeddings = [EmbeddingCache.get_embedding(val, model_embedder) for val in values]
-
-#         # Calculate pairwise similarities
-#         similarity_matrix = np.zeros((len(values), len(values)))
-#         for i in range(len(values)):
-#             for j in range(i+1, len(values)):
-#                 sim = float(util.cos_sim(embeddings[i], embeddings[j]).item())
-#                 similarity_matrix[i][j] = sim
-#                 similarity_matrix[j][i] = sim
-
-#         # Simple clustering: group values with high similarity
-#         clusters = []
-#         visited = set()
-
-#         for i, (val, score) in enumerate(scores):
-#             if i in visited:
-#                 continue
-
-#             cluster = [val]
-#             visited.add(i)
-
-#             # Find similar values
-#             for j in range(len(values)):
-#                 if j not in visited and similarity_matrix[i][j] >= cluster_threshold:
-#                     cluster.append(values[j])
-#                     visited.add(j)
-
-#             clusters.append(cluster)
-
-#         return clusters
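A self-contained re-run of the greedy grouping with an invented similarity matrix, to make the single-pass behaviour concrete - a value joins the first cluster whose seed it resembles:

import numpy as np

values = ["Living Room", "Lounge", "Bathroom"]
sim = np.array([[1.0, 0.8, 0.1],
                [0.8, 1.0, 0.2],
                [0.1, 0.2, 1.0]])   # invented cosine similarities

clusters, visited = [], set()
for i, val in enumerate(values):     # assumes score order matches value order
    if i in visited:
        continue
    cluster = [val]
    visited.add(i)
    for j in range(len(values)):
        if j not in visited and sim[i][j] >= 0.4:
            cluster.append(values[j])
            visited.add(j)
    clusters.append(cluster)

print(clusters)  # [['Living Room', 'Lounge'], ['Bathroom']]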
-
-#     @staticmethod
-#     def get_dynamic_threshold(
-#         attr: str,
-#         val: str,
-#         base_score: float,
-#         extracted_attrs: Dict[str, List[Dict[str, str]]],
-#         relationships: Dict[str, float],
-#         mandatory_attrs: Dict[str, List[str]],
-#         base_threshold: float = 0.65,
-#         boost_factor: float = 0.15
-#     ) -> float:
-#         """
-#         Calculate dynamic threshold based on relationships with already-extracted attributes.
-#         """
-#         threshold = base_threshold
-
-#         # Check relationships with already extracted attributes
-#         max_relationship = 0.0
-#         for other_attr, other_values_list in extracted_attrs.items():
-#             if other_attr == attr:
-#                 continue
-
-#             for other_val_dict in other_values_list:
-#                 other_val = other_val_dict['value']
-#                 key = f"{attr}:{val}->{other_attr}:{other_val}"
-#                 if key in relationships:
-#                     max_relationship = max(max_relationship, relationships[key])
-
-#         # If strong relationship exists, lower threshold
-#         if max_relationship > 0.6:
-#             threshold = base_threshold - (boost_factor * max_relationship)
-
-#         return max(0.3, threshold)
-
-#     @staticmethod
-#     def get_adaptive_margin(
-#         scores: List[Tuple[str, float]],
-#         base_margin: float = 0.15,
-#         max_margin: float = 0.22
-#     ) -> float:
-#         """
-#         Calculate adaptive margin based on score distribution.
-#         """
-#         if len(scores) < 2:
-#             return base_margin
-
-#         score_values = [s for _, s in scores]
-#         best_score = score_values[0]
-
-#         # If best score is very low, use adaptive margin but be more conservative
-#         if best_score < 0.5:
-#             # Calculate score spread in top 3-4 scores only (more selective)
-#             top_scores = score_values[:min(4, len(score_values))]
-#             score_range = max(top_scores) - min(top_scores)
-
-#             # Very controlled margin increase
-#             if score_range < 0.30:
-#                 # Much more conservative scaling
-#                 score_factor = (0.5 - best_score) * 0.35
-#                 adaptive = base_margin + score_factor + (0.30 - score_range) * 0.2
-#                 return min(adaptive, max_margin)
-
-#         return base_margin
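A worked pass through the adaptive branch (scores invented):

scores = [("Modern", 0.42), ("Contemporary", 0.38),
          ("Minimalist", 0.35), ("Rustic", 0.30)]
# best_score = 0.42 < 0.5; top-4 range = 0.42 - 0.30 = 0.12 < 0.30, so:
#   score_factor = (0.5 - 0.42) * 0.35                = 0.028
#   adaptive     = 0.15 + 0.028 + (0.30 - 0.12) * 0.2 = 0.214
# get_adaptive_margin(scores) -> min(0.214, 0.22) = 0.214 (vs. base 0.15)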
-
-#     @staticmethod
-#     def _lexical_evidence(product_text: str, label: str) -> float:
-#         """Calculate lexical overlap between product text and label."""
-#         pt = product_text.lower()
-#         tokens = [t for t in label.lower().replace("-", " ").split() if t]
-#         if not tokens:
-#             return 0.0
-#         hits = sum(1 for t in tokens if t in pt)
-#         return hits / len(tokens)
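Worked example - two of the three label tokens occur in the text:

text = "hand-painted canvas for the living room"
label = "Living Room Decor"
# tokens: ["living", "room", "decor"]; hits: "living", "room"
# _lexical_evidence(text, label) -> 2 / 3 ~= 0.67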
-
-#     @staticmethod
-#     def normalize_against_product_text(
-#         product_text: str,
-#         mandatory_attrs: Dict[str, List[str]],
-#         source_map: Dict[str, str],
-#         threshold_abs: float = 0.65,
-#         margin: float = 0.15,
-#         allow_multiple: bool = False,
-#         sem_weight: float = 0.8,
-#         lex_weight: float = 0.2,
-#         extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
-#         relationships: Optional[Dict[str, float]] = None,
-#         use_dynamic_thresholds: bool = True,
-#         use_adaptive_margin: bool = True,
-#         use_semantic_clustering: bool = True
-#     ) -> dict:
-#         """
-#         Score each allowed value against the product_text with dynamic thresholds.
-#         Returns dict with values in array format: [{"value": "...", "source": "..."}]
-        
-#         ⚡ OPTIMIZED: Uses EmbeddingCache for faster computation
-#         """
-#         if extracted_attrs is None:
-#             extracted_attrs = {}
-#         if relationships is None:
-#             relationships = {}
-
-#         # USE EMBEDDING CACHE - CRITICAL OPTIMIZATION
-#         pt_emb = EmbeddingCache.get_embedding(product_text, model_embedder)
-#         extracted = {}
-
-#         for attr, allowed_values in mandatory_attrs.items():
-#             scores: List[Tuple[str, float]] = []
-            
-#             # Check if this is a dimension attribute
-#             is_dimension_attr = any(keyword in attr.lower() for keyword in ['dimension', 'size', 'measurement'])
-            
-#             # Normalize product text once for dimension matching
-#             normalized_product_text = ProductAttributeService.normalize_dimension_text(product_text) if is_dimension_attr else ""
-
-#             for val in allowed_values:
-#                 # For dimension attributes, try exact normalized matching first
-#                 if is_dimension_attr:
-#                     # Normalize the allowed value from the list
-#                     normalized_val = ProductAttributeService.normalize_dimension_text(val)
-                    
-#                     # If we have both normalized values and they match exactly, give highest score
-#                     if normalized_val and normalized_product_text and normalized_val == normalized_product_text:
-#                         scores.append((val, 1.0))
-#                         continue
-                    
-#                     # Also check if the normalized value appears in the original product text
-#                     if normalized_val:
-#                         val_numbers = normalized_val.split('x')
-#                         text_lower = product_text.lower()
-#                         if all(num in text_lower for num in val_numbers):
-#                             idx1 = text_lower.find(val_numbers[0])
-#                             idx2 = text_lower.find(val_numbers[1])
-#                             if idx1 != -1 and idx2 != -1:
-#                                 distance = abs(idx2 - idx1)
-#                                 if distance < 20:
-#                                     scores.append((val, 0.95))
-#                                     continue
-                
-#                 # Standard semantic matching - USE EMBEDDING CACHE
-#                 contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}", f"{val} room"]
-#                 ctx_embs = [EmbeddingCache.get_embedding(c, model_embedder) for c in contexts]
-#                 sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
-
-#                 lex_score = ProductAttributeService._lexical_evidence(product_text, val)
-#                 final_score = sem_weight * sem_sim + lex_weight * lex_score
-#                 scores.append((val, final_score))
-
-#             scores.sort(key=lambda x: x[1], reverse=True)
-#             best_val, best_score = scores[0]
-
-#             # Calculate adaptive margin if enabled
-#             effective_margin = margin
-#             if allow_multiple and use_adaptive_margin:
-#                 effective_margin = ProductAttributeService.get_adaptive_margin(scores, margin)
-
-#             # Special handling for dimension attributes with exact matches
-#             if is_dimension_attr and best_score >= 0.90:
-#                 source = ProductAttributeService.find_value_source(best_val, source_map, attr)
-#                 extracted[attr] = [{"value": best_val, "source": source}]
-#                 continue
-
-#             if not allow_multiple:
-#                 source = ProductAttributeService.find_value_source(best_val, source_map, attr)
-#                 extracted[attr] = [{"value": best_val, "source": source}]
-#             else:
-#                 candidates = [best_val]
-#                 use_base_threshold = best_score >= threshold_abs
-
-#                 # Get semantic clusters if enabled
-#                 clusters = []
-#                 if use_semantic_clustering:
-#                     clusters = ProductAttributeService.calculate_value_clusters(
-#                         allowed_values, scores, cluster_threshold=0.4
-#                     )
-#                     best_cluster = next((c for c in clusters if best_val in c), [best_val])
-
-#                 for val, sc in scores[1:]:
-#                     min_score = 0.4 if is_dimension_attr else 0.3
-#                     if sc < min_score:
-#                         continue
-                    
-#                     if use_dynamic_thresholds and extracted_attrs:
-#                         dynamic_thresh = ProductAttributeService.get_dynamic_threshold(
-#                             attr, val, sc, extracted_attrs, relationships,
-#                             mandatory_attrs, threshold_abs
-#                         )
-#                     else:
-#                         dynamic_thresh = threshold_abs
-
-#                     within_margin = (best_score - sc) <= effective_margin
-#                     above_threshold = sc >= dynamic_thresh
-
-#                     in_cluster = False
-#                     if use_semantic_clustering and clusters:
-#                         in_cluster = any(best_val in c and val in c for c in clusters)
-
-#                     if use_base_threshold:
-#                         if above_threshold and within_margin:
-#                             candidates.append(val)
-#                         elif in_cluster and within_margin:
-#                             candidates.append(val)
-#                     else:
-#                         if within_margin:
-#                             candidates.append(val)
-#                         elif in_cluster and (best_score - sc) <= effective_margin * 2.0:
-#                             candidates.append(val)
-
-#                 extracted[attr] = []
-#                 for candidate in candidates:
-#                     source = ProductAttributeService.find_value_source(candidate, source_map, attr)
-#                     extracted[attr].append({"value": candidate, "source": source})
-
-#         return extracted
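An invented single-value call, assuming the embedder ranks "Living Room" highest for this text (the intended behaviour); the output uses the array format described in the docstring:

extracted = ProductAttributeService.normalize_against_product_text(
    product_text="Modern canvas print for the living room",
    mandatory_attrs={"room_type": ["Living Room", "Bedroom", "Bathroom"]},
    source_map={"title": "Modern canvas print for the living room"},
)
# extracted -> {"room_type": [{"value": "Living Room", "source": "title"}]}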
-
-#     @staticmethod
-#     def extract_attributes(
-#         product_text: str,
-#         mandatory_attrs: Dict[str, List[str]],
-#         source_map: Dict[str, str] = None,
-#         model: str = None,
-#         extract_additional: bool = True,
-#         multiple: Optional[List[str]] = None,
-#         threshold_abs: float = 0.65,
-#         margin: float = 0.15,
-#         use_dynamic_thresholds: bool = True,
-#         use_adaptive_margin: bool = True,
-#         use_semantic_clustering: bool = True,
-#         use_cache: bool = True  # ⚡ NEW: Enable/disable caching
-#     ) -> dict:
-#         """
-#         Use Groq LLM to extract attributes from any product type with enhanced multi-value selection.
-#         Now returns values in array format: [{"value": "...", "source": "..."}]
-        
-#         ⚡ OPTIMIZED: Added caching layer for faster repeated requests
-#         """
-        
-#         if model is None:
-#             model = settings.SUPPORTED_MODELS[0]
-
-#         if multiple is None:
-#             multiple = []
-
-#         if source_map is None:
-#             source_map = {}
-
-#         # Check if product text is empty or minimal
-#         if not product_text or product_text == "No product information available":
-#             return ProductAttributeService._create_error_response(
-#                 "No product information provided",
-#                 mandatory_attrs,
-#                 extract_additional
-#             )
-
-#         # ⚡ CHECK CACHE FIRST
-#         if use_cache:
-#             cache_key = ProductAttributeService._generate_cache_key(product_text, mandatory_attrs)
-#             cached_result = SimpleCache.get(cache_key)
-#             if cached_result:
-#                 logger.info(f"✓ Cache hit - returning cached result")
-#                 return cached_result
-
-#         # Create structured prompt for mandatory attributes
-#         mandatory_attr_list = []
-#         for attr_name, allowed_values in mandatory_attrs.items():
-#             mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
-#         mandatory_attr_text = "\n".join(mandatory_attr_list)
-
-#         additional_instruction = ""
-#         if extract_additional:
-#             additional_instruction = """
-# 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text 
-#    that are NOT in the mandatory list. Only include attributes where you can find actual values
-#    in the product text. Do NOT include attributes with "Not Specified" or empty values.
-   
-#    Examples of attributes to look for (only if present): Brand, Material, Size, Color, Dimensions,
-#    Weight, Features, Style, Theme, Pattern, Finish, Care Instructions, etc."""
-
-#         output_format = {
-#             "mandatory": {attr: "value or list of values" for attr in mandatory_attrs.keys()},
-#         }
-
-#         if extract_additional:
-#             output_format["additional"] = {
-#                 "example_attribute_1": "actual value found",
-#                 "example_attribute_2": "actual value found"
-#             }
-#             output_format["additional"]["_note"] = "Only include attributes with actual values found in text"
-
-#         prompt = f"""
-# You are an intelligent product attribute extractor that works with ANY product type.
-
-# TASK:
-# 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value(s)
-#    from the provided list. Choose the value(s) that best match the product description.
-# {additional_instruction}
-
-# Product Text:
-# {product_text}
-
-# Mandatory Attribute Lists (MUST select from these allowed values):
-# {mandatory_attr_text}
-
-# CRITICAL INSTRUCTIONS:
-# - Return ONLY valid JSON, nothing else
-# - No explanations, no markdown, no text before or after the JSON
-# - For mandatory attributes, choose the value(s) from the provided list that best match
-# - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
-# - Prefer exact matches from the allowed values list over generic synonyms
-# - If multiple values are plausible, you MAY return more than one
-# {f"- For additional attributes: ONLY include attributes where you found actual values in the product text. DO NOT include attributes with 'Not Specified', 'None', 'N/A', or empty values. If you cannot find a value for an attribute, simply don't include that attribute." if extract_additional else ""}
-# - Be precise and only extract information that is explicitly stated or clearly implied
-
-# Required Output Format:
-# {json.dumps(output_format, indent=2)}
-#         """
-
-#         payload = {
-#             "model": model,
-#             "messages": [
-#                 {
-#                     "role": "system",
-#                     "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
-#                 },
-#                 {"role": "user", "content": prompt}
-#             ],
-#             "temperature": 0.0,
-#             "max_tokens": 1500
-#         }
-
-#         headers = {
-#             "Authorization": f"Bearer {settings.GROQ_API_KEY}",
-#             "Content-Type": "application/json",
-#         }
-
-#         try:
-#             response = requests.post(
-#                 settings.GROQ_API_URL,
-#                 headers=headers,
-#                 json=payload,
-#                 timeout=30
-#             )
-#             response.raise_for_status()
-#             result_text = response.json()["choices"][0]["message"]["content"].strip()
-
-#             # Clean the response
-#             result_text = ProductAttributeService._clean_json_response(result_text)
-
-#             # Parse JSON
-#             parsed = json.loads(result_text)
-
-#             # Validate and restructure with source tracking
-#             parsed = ProductAttributeService._validate_response_structure(
-#                 parsed, mandatory_attrs, extract_additional, source_map
-#             )
-
-#             # Clean up and add source tracking to additional attributes in array format
-#             if extract_additional and "additional" in parsed:
-#                 cleaned_additional = {}
-#                 for k, v in parsed["additional"].items():
-#                     if v and v not in ["Not Specified", "None", "N/A", "", "not specified", "none", "n/a"]:
-#                         if not (isinstance(v, str) and v.lower() in ["not specified", "none", "n/a", ""]):
-#                             # Convert to array format if not already
-#                             if isinstance(v, list):
-#                                 cleaned_additional[k] = []
-#                                 for item in v:
-#                                     if isinstance(item, dict) and "value" in item:
-#                                         if "source" not in item:
-#                                             item["source"] = ProductAttributeService.find_value_source(
-#                                                 item["value"], source_map, k
-#                                             )
-#                                         cleaned_additional[k].append(item)
-#                                     else:
-#                                         source = ProductAttributeService.find_value_source(str(item), source_map, k)
-#                                         cleaned_additional[k].append({"value": str(item), "source": source})
-#                             else:
-#                                 source = ProductAttributeService.find_value_source(str(v), source_map, k)
-#                                 cleaned_additional[k] = [{"value": str(v), "source": source}]
-#                 parsed["additional"] = cleaned_additional
-
-#             # Calculate attribute relationships if using dynamic thresholds
-#             relationships = {}
-#             if use_dynamic_thresholds:
-#                 relationships = ProductAttributeService.calculate_attribute_relationships(
-#                     mandatory_attrs, product_text
-#                 )
-
-#             # Process attributes in order, allowing earlier ones to influence later ones
-#             extracted_so_far = {}
-#             for attr in mandatory_attrs.keys():
-#                 allow_multiple = attr in multiple
-
-#                 result = ProductAttributeService.normalize_against_product_text(
-#                     product_text=product_text,
-#                     mandatory_attrs={attr: mandatory_attrs[attr]},
-#                     source_map=source_map,
-#                     threshold_abs=threshold_abs,
-#                     margin=margin,
-#                     allow_multiple=allow_multiple,
-#                     extracted_attrs=extracted_so_far,
-#                     relationships=relationships,
-#                     use_dynamic_thresholds=use_dynamic_thresholds,
-#                     use_adaptive_margin=use_adaptive_margin,
-#                     use_semantic_clustering=use_semantic_clustering
-#                 )
-
-#                 # Result is already in array format from normalize_against_product_text
-#                 parsed["mandatory"][attr] = result[attr]
-#                 extracted_so_far[attr] = result[attr]
-
-#             # ⚡ CACHE THE RESULT
-#             if use_cache:
-#                 SimpleCache.set(cache_key, parsed)
-#                 logger.info(f"✓ Cached extraction result")
-
-#             return parsed
-
-#         except requests.exceptions.RequestException as e:
-#             logger.error(f"Request exception: {str(e)}")
-#             return ProductAttributeService._create_error_response(
-#                 str(e), mandatory_attrs, extract_additional
-#             )
-#         except json.JSONDecodeError as e:
-#             logger.error(f"JSON decode error: {str(e)}")
-#             return ProductAttributeService._create_error_response(
-#                 f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
-#             )
-#         except Exception as e:
-#             logger.error(f"Unexpected error: {str(e)}")
-#             return ProductAttributeService._create_error_response(
-#                 str(e), mandatory_attrs, extract_additional
-#             )
-
-#     @staticmethod
-#     def extract_attributes_batch(
-#         products: List[Dict],
-#         mandatory_attrs: Dict[str, List[str]],
-#         model: str = None,
-#         extract_additional: bool = True,
-#         process_image: bool = True,
-#         max_workers: int = 5,
-#         multiple: Optional[List[str]] = None,
-#         threshold_abs: float = 0.65,
-#         margin: float = 0.15,
-#         use_dynamic_thresholds: bool = True,
-#         use_adaptive_margin: bool = True,
-#         use_semantic_clustering: bool = True,
-#         use_cache: bool = True  # ⚡ NEW: Enable caching for batch processing
-#     ) -> Dict:
-#         """
-#         Extract attributes for multiple products in parallel with enhanced multi-value selection and source tracking.
-#         ⚡ OPTIMIZED: Added caching support for batch operations
-#         """
-#         results = []
-#         successful = 0
-#         failed = 0
-        
-#         ocr_service = OCRService()
-
-#         if multiple is None:
-#             multiple = []
-
-#         def process_product(product_data):
-#             """Process a single product."""
-#             product_id = product_data.get('product_id', f"product_{len(results)}")
-            
-#             try:
-#                 # Process image if URL is provided
-#                 ocr_results = None
-#                 ocr_text = None
-                
-#                 if process_image and product_data.get('image_url'):
-#                     ocr_results = ocr_service.process_image(product_data['image_url'])
-                    
-#                     # Extract attributes from OCR
-#                     if ocr_results and ocr_results.get('detected_text'):
-#                         ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
-#                             ocr_results, model
-#                         )
-#                         ocr_results['extracted_attributes'] = ocr_attrs
-                        
-#                         # Format OCR text for combining with product text
-#                         ocr_text = "\n".join([
-#                             f"{item['text']} (confidence: {item['confidence']:.2f})"
-#                             for item in ocr_results['detected_text']
-#                         ])
-                
-#                 # Combine all product information with source tracking
-#                 product_text, source_map = ProductAttributeService.combine_product_text(
-#                     title=product_data.get('title'),
-#                     short_desc=product_data.get('short_desc'),
-#                     long_desc=product_data.get('long_desc'),
-#                     ocr_text=ocr_text
-#                 )
-                
-#                 # Extract attributes from combined text with enhanced features
-#                 result = ProductAttributeService.extract_attributes(
-#                     product_text=product_text,
-#                     mandatory_attrs=mandatory_attrs,
-#                     source_map=source_map,
-#                     model=model,
-#                     extract_additional=extract_additional,
-#                     multiple=multiple,
-#                     threshold_abs=threshold_abs,
-#                     margin=margin,
-#                     use_dynamic_thresholds=use_dynamic_thresholds,
-#                     use_adaptive_margin=use_adaptive_margin,
-#                     use_semantic_clustering=use_semantic_clustering,
-#                     use_cache=use_cache  # ⚡ Pass cache flag
-#                 )
-                
-#                 result['product_id'] = product_id
-                
-#                 # Add OCR results if available (already in correct format)
-#                 if ocr_results:
-#                     result['ocr_results'] = ocr_results
-                
-#                 # Check if extraction was successful
-#                 if 'error' not in result:
-#                     return result, True
-#                 else:
-#                     return result, False
-                    
-#             except Exception as e:
-#                 logger.error(f"Error processing product {product_id}: {str(e)}")
-#                 return {
-#                     'product_id': product_id,
-#                     'mandatory': {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
-#                     'additional': {} if extract_additional else None,
-#                     'error': f"Processing error: {str(e)}"
-#                 }, False
-
-#         # Process products in parallel
-#         with ThreadPoolExecutor(max_workers=max_workers) as executor:
-#             future_to_product = {
-#                 executor.submit(process_product, product): product 
-#                 for product in products
-#             }
-            
-#             for future in as_completed(future_to_product):
-#                 try:
-#                     result, success = future.result()
-#                     results.append(result)
-#                     if success:
-#                         successful += 1
-#                     else:
-#                         failed += 1
-#                 except Exception as e:
-#                     logger.error(f"Future execution error: {str(e)}")
-#                     failed += 1
-#                     results.append({
-#                         'product_id': 'unknown',
-#                         'mandatory': {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
-#                         'additional': {} if extract_additional else None,
-#                         'error': f"Unexpected error: {str(e)}"
-#                     })
-
-#         return {
-#             'results': results,
-#             'total_products': len(products),
-#             'successful': successful,
-#             'failed': failed,
-#             'cache_stats': SimpleCache.get_stats(),  # ⚡ Include cache statistics
-#             'embedding_cache_stats': EmbeddingCache.get_stats()  # ⚡ Include embedding cache stats
-#         }
-
-#     @staticmethod
-#     def _clean_json_response(text: str) -> str:
-#         """Clean LLM response to extract valid JSON."""
-#         start_idx = text.find('{')
-#         end_idx = text.rfind('}')
-
-#         if start_idx != -1 and end_idx != -1:
-#             text = text[start_idx:end_idx + 1]
-
-#         if "```json" in text:
-#             text = text.split("```json")[1].split("```")[0].strip()
-#         elif "```" in text:
-#             text = text.split("```")[1].split("```")[0].strip()
-#             if text.startswith("json"):
-#                 text = text[4:].strip()
-
-#         return text
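Example cleanup (input invented). Because the brace trim runs first, the fence-stripping branches below it only fire when the reply contains no braces at all:

raw = 'Sure, here is the JSON:\n```json\n{"mandatory": {"style": "Modern"}}\n```'
# _clean_json_response(raw) -> '{"mandatory": {"style": "Modern"}}'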
-
-#     @staticmethod
-#     def _validate_response_structure(
-#         parsed: dict,
-#         mandatory_attrs: Dict[str, List[str]],
-#         extract_additional: bool,
-#         source_map: Dict[str, str] = None
-#     ) -> dict:
-#         """Validate and fix the response structure, ensuring array format with source tracking."""
-#         if source_map is None:
-#             source_map = {}
-        
-#         expected_sections = ["mandatory"]
-#         if extract_additional:
-#             expected_sections.append("additional")
-
-#         if not all(section in parsed for section in expected_sections):
-#             if isinstance(parsed, dict):
-#                 mandatory_keys = set(mandatory_attrs.keys())
-#                 mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
-#                 additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
-
-#                 result = {"mandatory": mandatory}
-#                 if extract_additional:
-#                     result["additional"] = additional
-#                 parsed = result
-#             else:
-#                 return ProductAttributeService._create_error_response(
-#                     "Invalid response structure",
-#                     mandatory_attrs,
-#                     extract_additional,
-#                     str(parsed)
-#                 )
-
-#         # Convert mandatory attributes to array format with source tracking
-#         if "mandatory" in parsed:
-#             converted_mandatory = {}
-#             for attr, value in parsed["mandatory"].items():
-#                 if isinstance(value, list):
-#                     # Already in array format, ensure each item has source
-#                     converted_mandatory[attr] = []
-#                     for item in value:
-#                         if isinstance(item, dict) and "value" in item:
-#                             # Already has proper structure
-#                             if "source" not in item:
-#                                 item["source"] = ProductAttributeService.find_value_source(
-#                                     item["value"], source_map, attr
-#                                 )
-#                             converted_mandatory[attr].append(item)
-#                         else:
-#                             # Convert string to proper format
-#                             source = ProductAttributeService.find_value_source(str(item), source_map, attr)
-#                             converted_mandatory[attr].append({"value": str(item), "source": source})
-#                 else:
-#                     # Single value - convert to array format
-#                     source = ProductAttributeService.find_value_source(str(value), source_map, attr)
-#                     converted_mandatory[attr] = [{"value": str(value), "source": source}]
-            
-#             parsed["mandatory"] = converted_mandatory
-
-#         return parsed
-
-#     @staticmethod
-#     def _create_error_response(
-#         error: str,
-#         mandatory_attrs: Dict[str, List[str]],
-#         extract_additional: bool,
-#         raw_output: Optional[str] = None
-#     ) -> dict:
-#         """Create a standardized error response in array format."""
-#         response = {
-#             "mandatory": {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
-#             "error": error
-#         }
-#         if extract_additional:
-#             response["additional"] = {}
-#         if raw_output:
-#             response["raw_output"] = raw_output
-#         return response
-
-#     @staticmethod
-#     def get_cache_stats() -> Dict:
-#         """
-#         Get statistics for both caches.
-#         ⚡ NEW: Utility method to monitor cache performance
-#         """
-#         return {
-#             "simple_cache": SimpleCache.get_stats(),
-#             "embedding_cache": EmbeddingCache.get_stats()
-#         }
-
-#     @staticmethod
-#     def clear_all_caches():
-#         """
-#         Clear both caches.
-#         ⚡ NEW: Utility method to reset caches when needed
-#         """
-#         SimpleCache.clear()
-#         EmbeddingCache.clear()
-#         logger.info("All caches cleared")
-
 
 # ==================== services.py (PERFORMANCE OPTIMIZED) ====================
 import requests

+ 59 - 516
attr_extraction/views.py

@@ -1,40 +1,77 @@
+# ==================== Cleaned & Optimized Imports ====================
+
+import os
+import io
+import json
+import logging
+import threading
+from datetime import datetime
+import pandas as pd
+import concurrent.futures
+
+from django.conf import settings
+from django.http import HttpResponse
+from django.db import transaction
+from django.db.models import Prefetch
+
 from rest_framework.views import APIView
 from rest_framework.response import Response
 from rest_framework import status
 from rest_framework.parsers import MultiPartParser, FormParser
-from django.db import transaction
-import pandas as pd
-from .models import Product, ProductType, ProductAttribute, AttributePossibleValue
+
+from openpyxl import Workbook
+from openpyxl.styles import Font, PatternFill, Alignment
+
+# --- Local imports ---
+from .models import (
+    Product,
+    ProductType,
+    ProductAttribute,
+    ProductAttributeValue,
+    AttributePossibleValue,
+)
 from .serializers import (
-    SingleProductRequestSerializer,
-    BatchProductRequestSerializer,
-    ProductAttributeResultSerializer, 
-    BatchProductResponseSerializer,
     ProductSerializer,
     ProductTypeSerializer,
     ProductAttributeSerializer,
-    AttributePossibleValueSerializer
+    AttributePossibleValueSerializer,
+    SingleProductRequestSerializer,
+    BatchProductRequestSerializer,
+    ProductAttributeResultSerializer,
+    BatchProductResponseSerializer,
+    ProductAttributeValueSerializer,
+    ProductAttributeValueInputSerializer,
+    BulkProductAttributeValueSerializer,
+    ProductWithAttributesSerializer,
 )
 from .services import ProductAttributeService
 from .ocr_service import OCRService
+from .visual_processing_service import VisualProcessingService
 
+# --- Configuration for Generated Outputs Folder ---
+OUTPUT_FOLDER_NAME = 'generated_outputs'
+OUTPUT_ROOT = os.path.join(settings.MEDIA_ROOT, OUTPUT_FOLDER_NAME)
+OUTPUT_URL = os.path.join(settings.MEDIA_URL, OUTPUT_FOLDER_NAME).replace('\\', '/')  # Ensure forward slashes
 
+LOG_FILE_NAME = 'excel_generation.log'
+STATUS_FILE_NAME = 'excel_generation_status.json'
+EXCEL_FILE_NAME = 'generated_products.xlsx'
 
-# Sample test images (publicly available)
-SAMPLE_IMAGES = {
-    "tshirt": "https://images.unsplash.com/photo-1521572163474-6864f9cf17ab",
-    "dress": "https://images.unsplash.com/photo-1595777457583-95e059d581b8",
-    "jeans": "https://images.unsplash.com/photo-1542272604-787c3835535d"
-}
+LOG_FILE_PATH = os.path.join(OUTPUT_ROOT, LOG_FILE_NAME)
+STATUS_FILE_PATH = os.path.join(OUTPUT_ROOT, STATUS_FILE_NAME)
+EXCEL_FILE_PATH = os.path.join(OUTPUT_ROOT, EXCEL_FILE_NAME)
 
-# ==================== Updated views.py ====================
-from rest_framework.views import APIView
-from rest_framework.response import Response
-from rest_framework import status
-from .models import Product
-from .services import ProductAttributeService
-from .ocr_service import OCRService
-from .visual_processing_service import VisualProcessingService
+# Ensure the output folder exists
+os.makedirs(OUTPUT_ROOT, exist_ok=True)
+
+# Configure logging
+logging.basicConfig(
+    filename=LOG_FILE_PATH,
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
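+# NOTE: basicConfig is process-wide and is silently ignored when the root
+# logger already has handlers (e.g. if the project's LOGGING setting
+# configures the root logger), so the file handler above only takes effect
+# in an otherwise-unconfigured process.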
+logger = logging.getLogger(__name__)
 
 
 class ExtractProductAttributesView(APIView):
@@ -131,398 +168,7 @@ class ExtractProductAttributesView(APIView):
 
         return Response(result, status=status.HTTP_200_OK)
 
-
-
-# Replace the BatchExtractProductAttributesView in your views.py with this updated version
-
-# class BatchExtractProductAttributesView(APIView):
-#     """
-#     API endpoint to extract product attributes for multiple products in batch.
-#     Uses item-specific mandatory_attrs with source tracking.
-#     Returns attributes in array format with original_value field.
-#     Includes OCR and Visual Processing results.
-#     """
-
-#     def post(self, request):
-#         serializer = BatchProductRequestSerializer(data=request.data)
-#         if not serializer.is_valid():
-#             return Response({"error": serializer.errors}, status=status.HTTP_400_BAD_REQUEST)
-
-#         validated_data = serializer.validated_data
-        
-#         # Get batch-level settings
-#         product_list = validated_data.get("products", [])
-#         model = validated_data.get("model")
-#         extract_additional = validated_data.get("extract_additional", True)
-#         process_image = validated_data.get("process_image", True)
-#         multiple = validated_data.get("multiple", [])
-#         threshold_abs = validated_data.get("threshold_abs", 0.65)
-#         margin = validated_data.get("margin", 0.15)
-#         use_dynamic_thresholds = validated_data.get("use_dynamic_thresholds", True)
-#         use_adaptive_margin = validated_data.get("use_adaptive_margin", True)
-#         use_semantic_clustering = validated_data.get("use_semantic_clustering", True)
-        
-#         # Extract all item_ids to query the database efficiently
-#         item_ids = [p['item_id'] for p in product_list] 
-        
-#         # Fetch all products in one query
-#         products_queryset = Product.objects.filter(item_id__in=item_ids)
-#         product_map = {product.item_id: product for product in products_queryset}
-#         found_ids = set(product_map.keys())
-        
-#         # Fetch all original attribute values for these products in one query
-#         original_values_qs = ProductAttributeValue.objects.filter(
-#             product__item_id__in=item_ids
-#         ).select_related('product')
-        
-#         # Create a nested dictionary: {item_id: {attribute_name: original_value}}
-#         original_values_map = {}
-#         for attr_val in original_values_qs:
-#             item_id = attr_val.product.item_id
-#             if item_id not in original_values_map:
-#                 original_values_map[item_id] = {}
-#             original_values_map[item_id][attr_val.attribute_name] = attr_val.original_value
-        
-#         results = []
-#         successful = 0
-#         failed = 0
-
-#         for product_entry in product_list:
-#             item_id = product_entry['item_id']
-#             mandatory_attrs = product_entry['mandatory_attrs'] 
-
-#             if item_id not in found_ids:
-#                 failed += 1
-#                 results.append({
-#                     "product_id": item_id,
-#                     "error": "Product not found in database"
-#                 })
-#                 continue
-
-#             product = product_map[item_id]
-            
-#             try: 
-#                 title = product.product_name
-#                 short_desc = product.product_short_description
-#                 long_desc = product.product_long_description
-#                 image_url = product.image_path
-                
-#                 ocr_results = None
-#                 ocr_text = None
-#                 visual_results = None
-
-#                 # Image Processing Logic
-#                 if process_image and image_url:
-#                     # OCR Processing
-#                     ocr_service = OCRService()
-#                     ocr_results = ocr_service.process_image(image_url)
-                    
-#                     if ocr_results and ocr_results.get("detected_text"):
-#                         ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
-#                             ocr_results, model
-#                         )
-#                         ocr_results["extracted_attributes"] = ocr_attrs
-#                         ocr_text = "\n".join([
-#                             f"{item['text']} (confidence: {item['confidence']:.2f})"
-#                             for item in ocr_results["detected_text"]
-#                         ])
-                    
-#                     # Visual Processing
-#                     visual_service = VisualProcessingService()
-#                     product_type_hint = product.product_type if hasattr(product, 'product_type') else None
-#                     visual_results = visual_service.process_image(image_url, product_type_hint)
-                    
-#                     # Format visual attributes to array format with source tracking
-#                     if visual_results and visual_results.get('visual_attributes'):
-#                         visual_results['visual_attributes'] = ProductAttributeService.format_visual_attributes(
-#                             visual_results['visual_attributes']
-#                         )
-
-#                 # Combine product text with source tracking
-#                 product_text, source_map = ProductAttributeService.combine_product_text(
-#                     title=title,
-#                     short_desc=short_desc,
-#                     long_desc=long_desc,
-#                     ocr_text=ocr_text
-#                 )
-
-#                 # Attribute Extraction with source tracking (returns array format)
-#                 extracted = ProductAttributeService.extract_attributes(
-#                     product_text=product_text,
-#                     mandatory_attrs=mandatory_attrs,
-#                     source_map=source_map,
-#                     model=model,
-#                     extract_additional=extract_additional,
-#                     multiple=multiple,
-#                     threshold_abs=threshold_abs,
-#                     margin=margin,
-#                     use_dynamic_thresholds=use_dynamic_thresholds,
-#                     use_adaptive_margin=use_adaptive_margin,
-#                     use_semantic_clustering=use_semantic_clustering
-#                 )
-
-#                 # Add original_value to each extracted attribute
-#                 original_attrs = original_values_map.get(item_id, {})
-                
-#                 # Process mandatory attributes
-#                 for attr_name, attr_values in extracted.get("mandatory", {}).items():
-#                     if isinstance(attr_values, list):
-#                         for attr_obj in attr_values:
-#                             if isinstance(attr_obj, dict):
-#                                 # Add original_value if it exists
-#                                 attr_obj["original_value"] = original_attrs.get(attr_name, "")
-                
-#                 # Process additional attributes
-#                 for attr_name, attr_values in extracted.get("additional", {}).items():
-#                     if isinstance(attr_values, list):
-#                         for attr_obj in attr_values:
-#                             if isinstance(attr_obj, dict):
-#                                 # Add original_value if it exists
-#                                 attr_obj["original_value"] = original_attrs.get(attr_name, "")
-
-#                 result = {
-#                     "product_id": product.item_id,
-#                     "mandatory": extracted.get("mandatory", {}),
-#                     "additional": extracted.get("additional", {}),
-#                 }
-
-#                 # Attach OCR results if available
-#                 if ocr_results:
-#                     result["ocr_results"] = ocr_results
-                
-#                 # Attach Visual Processing results if available
-#                 if visual_results:
-#                     result["visual_results"] = visual_results
-
-#                 results.append(result)
-#                 successful += 1
-
-#             except Exception as e:
-#                 failed += 1
-#                 results.append({
-#                     "product_id": item_id,
-#                     "error": str(e)
-#                 })
-
-#         batch_result = {
-#             "results": results,
-#             "total_products": len(product_list),
-#             "successful": successful,
-#             "failed": failed
-#         }
-
-#         response_serializer = BatchProductResponseSerializer(data=batch_result)
-#         if response_serializer.is_valid():
-#             return Response(response_serializer.data, status=status.HTTP_200_OK)
-
-#         return Response(batch_result, status=status.HTTP_200_OK)
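
This batch view and the optimized variant below forward the same matching knobs (threshold_abs, margin, and the use_* flags) to ProductAttributeService.extract_attributes. As a rough illustration of how an absolute similarity floor and a relative margin typically combine (a sketch of the general technique, not the service's actual selection code), a candidate survives only if it clears threshold_abs and sits within margin of the best match:

from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer("all-MiniLM-L6-v2")

def select_candidates(value, allowed_values, threshold_abs=0.65, margin=0.15):
    # Absolute floor: drop anything below threshold_abs.
    # Relative cut: drop anything more than `margin` below the best score.
    value_emb = embedder.encode(value, convert_to_tensor=True)
    allowed_embs = embedder.encode(allowed_values, convert_to_tensor=True)
    sims = util.cos_sim(value_emb, allowed_embs)[0]
    best = float(sims.max())
    return [
        (allowed_values[i], float(s))
        for i, s in enumerate(sims)
        if float(s) >= threshold_abs and best - float(s) <= margin
    ]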
-
-
-# views.py - OPTIMIZED WITHOUT REDIS/CELERY
-
-# class BatchExtractProductAttributesView(APIView):
-#     """
-#     Optimized batch extraction using ThreadPoolExecutor (Python standard library)
-#     """
-
-#     def post(self, request):
-#         serializer = BatchProductRequestSerializer(data=request.data)
-#         if not serializer.is_valid():
-#             return Response({"error": serializer.errors}, status=status.HTTP_400_BAD_REQUEST)
-
-#         validated_data = serializer.validated_data
-#         product_list = validated_data.get("products", [])
-        
-#         # OPTIMIZATION 1: Single optimized database query
-#         item_ids = [p['item_id'] for p in product_list]
-#         products_queryset = Product.objects.filter(
-#             item_id__in=item_ids
-#         ).prefetch_related('attribute_values')  # Single query!
-        
-#         product_map = {product.item_id: product for product in products_queryset}
-        
-#         # OPTIMIZATION 2: Prefetch ALL original attribute values in ONE query
-#         original_values_qs = ProductAttributeValue.objects.filter(
-#             product__item_id__in=item_ids
-#         ).select_related('product')
-        
-#         original_values_map = {}
-#         for attr_val in original_values_qs:
-#             item_id = attr_val.product.item_id
-#             if item_id not in original_values_map:
-#                 original_values_map[item_id] = {}
-#             original_values_map[item_id][attr_val.attribute_name] = attr_val.original_value
-        
-#         # Extract settings
-#         model = validated_data.get("model")
-#         extract_additional = validated_data.get("extract_additional", True)
-#         process_image = validated_data.get("process_image", True)
-#         multiple = validated_data.get("multiple", [])
-#         threshold_abs = validated_data.get("threshold_abs", 0.65)
-#         margin = validated_data.get("margin", 0.15)
-#         use_dynamic_thresholds = validated_data.get("use_dynamic_thresholds", True)
-#         use_adaptive_margin = validated_data.get("use_adaptive_margin", True)
-#         use_semantic_clustering = validated_data.get("use_semantic_clustering", True)
-        
-#         results = []
-#         successful = 0
-#         failed = 0
-        
-#         # OPTIMIZATION 3: Initialize services once
-#         ocr_service = OCRService() if process_image else None
-#         visual_service = VisualProcessingService() if process_image else None
-
-#         # OPTIMIZATION 4: Process in parallel using ThreadPoolExecutor
-#         def process_single_product(product_entry):
-#             """Process a single product (runs in parallel)"""
-#             item_id = product_entry['item_id']
-#             mandatory_attrs = product_entry['mandatory_attrs']
-
-#             if item_id not in product_map:
-#                 return {
-#                     "product_id": item_id,
-#                     "error": "Product not found in database"
-#                 }, False
-
-#             product = product_map[item_id]
-            
-#             try:
-#                 title = product.product_name
-#                 short_desc = product.product_short_description
-#                 long_desc = product.product_long_description
-#                 image_url = product.image_path
-                
-#                 ocr_results = None
-#                 ocr_text = None
-#                 visual_results = None
-
-#                 # Image processing (if enabled)
-#                 if process_image and image_url:
-#                     if ocr_service:
-#                         ocr_results = ocr_service.process_image(image_url)
-                        
-#                         if ocr_results and ocr_results.get("detected_text"):
-#                             ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
-#                                 ocr_results, model
-#                             )
-#                             ocr_results["extracted_attributes"] = ocr_attrs
-#                             ocr_text = "\n".join([
-#                                 f"{item['text']} (confidence: {item['confidence']:.2f})"
-#                                 for item in ocr_results["detected_text"]
-#                             ])
-                    
-#                     if visual_service:
-#                         product_type_hint = product.product_type if hasattr(product, 'product_type') else None
-#                         visual_results = visual_service.process_image(image_url, product_type_hint)
-                        
-#                         if visual_results and visual_results.get('visual_attributes'):
-#                             visual_results['visual_attributes'] = ProductAttributeService.format_visual_attributes(
-#                                 visual_results['visual_attributes']
-#                             )
-
-#                 # Combine product text with source tracking
-#                 product_text, source_map = ProductAttributeService.combine_product_text(
-#                     title=title,
-#                     short_desc=short_desc,
-#                     long_desc=long_desc,
-#                     ocr_text=ocr_text
-#                 )
-
-#                 # Extract attributes (WITH CACHING ENABLED)
-#                 extracted = ProductAttributeService.extract_attributes(
-#                     product_text=product_text,
-#                     mandatory_attrs=mandatory_attrs,
-#                     source_map=source_map,
-#                     model=model,
-#                     extract_additional=extract_additional,
-#                     multiple=multiple,
-#                     threshold_abs=threshold_abs,
-#                     margin=margin,
-#                     use_dynamic_thresholds=use_dynamic_thresholds,
-#                     use_adaptive_margin=use_adaptive_margin,
-#                     use_semantic_clustering=use_semantic_clustering,
-#                     use_cache=True  # Enable caching!
-#                 )
-
-#                 # Add original values
-#                 original_attrs = original_values_map.get(item_id, {})
-                
-#                 for attr_name, attr_values in extracted.get("mandatory", {}).items():
-#                     if isinstance(attr_values, list):
-#                         for attr_obj in attr_values:
-#                             if isinstance(attr_obj, dict):
-#                                 attr_obj["original_value"] = original_attrs.get(attr_name, "")
-                
-#                 for attr_name, attr_values in extracted.get("additional", {}).items():
-#                     if isinstance(attr_values, list):
-#                         for attr_obj in attr_values:
-#                             if isinstance(attr_obj, dict):
-#                                 attr_obj["original_value"] = original_attrs.get(attr_name, "")
-
-#                 result = {
-#                     "product_id": product.item_id,
-#                     "mandatory": extracted.get("mandatory", {}),
-#                     "additional": extracted.get("additional", {}),
-#                 }
-
-#                 if ocr_results:
-#                     result["ocr_results"] = ocr_results
-                
-#                 if visual_results:
-#                     result["visual_results"] = visual_results
-
-#                 return result, True
-
-#             except Exception as e:
-#                 return {
-#                     "product_id": item_id,
-#                     "error": str(e)
-#                 }, False
-
-#         # OPTIMIZATION 5: Use ThreadPoolExecutor for parallel processing
-#         import concurrent.futures
-#         max_workers = min(10, len(product_list))  # Up to 10 parallel workers
-        
-#         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-#             # Submit all tasks
-#             future_to_product = {
-#                 executor.submit(process_single_product, product): product
-#                 for product in product_list
-#             }
-            
-#             # Collect results as they complete
-#             for future in concurrent.futures.as_completed(future_to_product):
-#                 try:
-#                     result, success = future.result()
-#                     results.append(result)
-#                     if success:
-#                         successful += 1
-#                     else:
-#                         failed += 1
-#                 except Exception as e:
-#                     failed += 1
-#                     # Recover the failing product from the future map instead
-#                     # of reporting an anonymous "unknown" product_id
-#                     product_entry = future_to_product[future]
-#                     logger.error(f"Unexpected error for {product_entry['item_id']}: {str(e)}")
-#                     results.append({
-#                         "product_id": product_entry['item_id'],
-#                         "error": str(e)
-#                     })
-
-#         batch_result = {
-#             "results": results,
-#             "total_products": len(product_list),
-#             "successful": successful,
-#             "failed": failed
-#         }
-
-#         response_serializer = BatchProductResponseSerializer(data=batch_result)
-#         if response_serializer.is_valid():
-#             return Response(response_serializer.data, status=status.HTTP_200_OK)
-
-#         return Response(batch_result, status=status.HTTP_200_OK)
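
Note that collecting futures with as_completed() means results is filled in completion order, not request order. If the response must mirror the incoming product_list, executor.map is a drop-in alternative that preserves ordering (a sketch, assuming process_single_product keeps the (result, success) return shape used above):

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # map() yields results in submission order, one per input product
    for result, success in executor.map(process_single_product, product_list):
        results.append(result)
        if success:
            successful += 1
        else:
            failed += 1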
-
-
 # ==================== OPTIMIZED BATCH VIEW ====================
-import concurrent.futures
 
 class BatchExtractProductAttributesView(APIView):
     """
@@ -773,7 +419,6 @@ class BatchExtractProductAttributesView(APIView):
         return Response(batch_result, status=status.HTTP_200_OK)
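
For reference, a request body accepted by this view looks roughly like the payload below. The field names come from the serializer reads shown above; the route, the model id, and the exact shape of mandatory_attrs are placeholders/assumptions:

payload = {
    "products": [
        {"item_id": "SKU-123", "mandatory_attrs": {"Color": ["Black", "White"]}},
    ],
    "model": "llama-3.1-8b-instant",   # placeholder model id
    "extract_additional": True,
    "process_image": True,
    "multiple": ["Color"],
    "threshold_abs": 0.65,
    "margin": 0.15,
    "use_dynamic_thresholds": True,
    "use_adaptive_margin": True,
    "use_semantic_clustering": True,
}
# e.g. requests.post("http://localhost:8000/api/extract/batch/", json=payload)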
 
 
-
 class ProductListView(APIView):
     """
     GET API to list all products with details
@@ -783,80 +428,6 @@ class ProductListView(APIView):
         serializer = ProductSerializer(products, many=True)
         return Response(serializer.data, status=status.HTTP_200_OK)
 
-
-from rest_framework.views import APIView
-from rest_framework.response import Response
-from rest_framework import status
-from rest_framework.parsers import MultiPartParser, FormParser
-import pandas as pd
-from .models import Product
-
-
-from rest_framework.views import APIView
-from rest_framework.response import Response
-from rest_framework import status
-from rest_framework.parsers import MultiPartParser, FormParser
-from django.db import transaction
-import pandas as pd
-from .models import Product, ProductAttributeValue
-
-import logging
-import json
-from rest_framework.views import APIView
-from rest_framework.response import Response
-from rest_framework import status
-from rest_framework.parsers import MultiPartParser, FormParser
-from django.db import transaction
-from django.db.models import Prefetch
-import pandas as pd
-# Import ALL your models
-from .models import Product, ProductAttributeValue, ProductType, ProductAttribute, AttributePossibleValue
-from .services import ProductAttributeService
-from .ocr_service import OCRService
-from .visual_processing_service import VisualProcessingService
-from openpyxl import Workbook
-from openpyxl.styles import Font, PatternFill, Alignment
-from django.conf import settings
-import os
-import threading
-from datetime import datetime
-
-# --- Configuration for Generated Outputs Folder ---
-OUTPUT_FOLDER_NAME = 'generated_outputs'
-OUTPUT_ROOT = os.path.join(settings.MEDIA_ROOT, OUTPUT_FOLDER_NAME)
-OUTPUT_URL = os.path.join(settings.MEDIA_URL, OUTPUT_FOLDER_NAME).replace('\\', '/') # Use forward slashes for URL
-
-# Define log and status file paths within the new subfolder
-LOG_FILE_NAME = 'excel_generation.log'
-STATUS_FILE_NAME = 'excel_generation_status.json'
-EXCEL_FILE_NAME = 'generated_products.xlsx'
-
-LOG_FILE_PATH = os.path.join(OUTPUT_ROOT, LOG_FILE_NAME)
-STATUS_FILE_PATH = os.path.join(OUTPUT_ROOT, STATUS_FILE_NAME)
-EXCEL_FILE_PATH = os.path.join(OUTPUT_ROOT, EXCEL_FILE_NAME)
-
-# Ensure the OUTPUT_ROOT exists for files to be saved
-if not os.path.exists(OUTPUT_ROOT):
-    os.makedirs(OUTPUT_ROOT)
-
-# Configure basic logging to the new path
-logging.basicConfig(
-    filename=LOG_FILE_PATH,
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-
 # -------------------------------------------------------------------------------------------------
 
 def generate_product_excel_background():
@@ -1199,7 +770,6 @@ def generate_product_excel_background():
         logger.exception("CRITICAL ERROR during background Excel generation process.")
         write_status("FAILED", error_msg=str(e))
 
-
 # -------------------------------------------------------------------------------------------------
 
 class ProductUploadExcelView(APIView):
@@ -1381,15 +951,6 @@ class ProductUploadExcelView(APIView):
             return Response({'error': f'An unexpected error occurred while processing the file: {str(e)}'}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
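
ProductUploadExcelView accepts a multipart upload (MultiPartParser) and reads it with pandas, per the imports above. A minimal sketch of the parsing step, assuming one product per row; the column layout is illustrative:

import pandas as pd

def parse_product_rows(uploaded_file):
    df = pd.read_excel(uploaded_file, sheet_name=0)
    df = df.fillna("")                    # keep NaN out of text fields
    return df.to_dict(orient="records")   # one dict per product row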
 
 
-# Add this view to your views.py for downloading a template
-
-from django.http import HttpResponse
-from openpyxl import Workbook
-from openpyxl.styles import Font, PatternFill, Alignment
-from rest_framework.views import APIView
-import io
-
-
 class DownloadExcelTemplateView(APIView):
     """
     GET API to download an Excel template with two sheets:
@@ -1921,24 +1482,6 @@ class ProductTypeListView(APIView):
         return Response({"product_types": list(product_types)}, status=status.HTTP_200_OK)
     
 
-
-# Add these views to your views.py
-
-from rest_framework.views import APIView
-from rest_framework.response import Response
-from rest_framework import status
-from rest_framework.parsers import MultiPartParser, FormParser
-from django.db import transaction
-import pandas as pd
-from .models import Product, ProductAttributeValue
-from .serializers import (
-    ProductAttributeValueSerializer,
-    ProductAttributeValueInputSerializer,
-    BulkProductAttributeValueSerializer,
-    ProductWithAttributesSerializer
-)
-
-
 class ProductAttributeValueView(APIView):
     """
     API to manage manually entered original attribute values.