Student Yadav пре 3 месеци
родитељ
комит
b350ab3eb2
2 измењених фајлова са 67 додато и 39 уклоњено
  1. 67 39
      attr_extraction/services.py
  2. BIN
      db.sqlite3

+ 67 - 39
attr_extraction/services.py

@@ -2807,22 +2807,34 @@ class ProductAttributeService:
         if len(float_numbers) < 2:
             return ""
         
-        # If we have 3 dimensions, remove the smallest one (usually thickness/depth)
-        # This handles cases like "16 x 1.5 x 20" -> keep 16 and 20, drop 1.5
-        if len(float_numbers) >= 3:
-            # Sort and remove the smallest
-            float_numbers_sorted = sorted(float_numbers, reverse=True)
-            float_numbers = float_numbers_sorted[:2]
-        
-        # Convert to integers for dimension matching
-        int_numbers = [int(num) for num in float_numbers]
+        # If we have 3 dimensions, it's likely Width x Depth x Height
+        # For wall art, depth is usually small (< 5), so we keep first and last
+        if len(float_numbers) == 3:
+            # Keep first and last values (width and height), skip middle (depth)
+            float_numbers = [float_numbers[0], float_numbers[2]]
+        elif len(float_numbers) > 3:
+            # If more than 3 dimensions, keep the two largest
+            float_numbers = sorted(float_numbers)[-2:]
+        else:
+            # Just 2 dimensions, use as is
+            float_numbers = float_numbers[:2]
+        
+        # Format numbers: use integer if whole, else one decimal
+        formatted_numbers = []
+        for num in float_numbers:
+            if num.is_integer():
+                formatted_numbers.append(str(int(num)))
+            else:
+                formatted_numbers.append(f"{num:.1f}")
         
         # Sort to ensure consistent order (smaller x larger)
-        int_numbers.sort()
+        formatted_numbers.sort(key=lambda x: float(x))
         
         # Return formatted dimension
-        return f"{int_numbers[0]}x{int_numbers[1]}"
+        return f"{formatted_numbers[0]}x{formatted_numbers[1]}"
 
+    
+    
     @staticmethod
     def normalize_value_for_matching(value: str, attr_name: str = "") -> str:
         """
@@ -3222,20 +3234,20 @@ If an attribute is not present, do not include it in the response.
 
     @staticmethod
     def normalize_against_product_text(
-        product_text: str,
-        mandatory_attrs: Dict[str, List[str]],
-        source_map: Dict[str, str],
-        threshold_abs: float = 0.65,
-        margin: float = 0.15,
-        allow_multiple: bool = False,
-        sem_weight: float = 0.8,
-        lex_weight: float = 0.2,
-        extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
-        relationships: Optional[Dict[str, float]] = None,
-        use_dynamic_thresholds: bool = True,
-        use_adaptive_margin: bool = True,
-        use_semantic_clustering: bool = True
-    ) -> dict:
+    product_text: str,
+    mandatory_attrs: Dict[str, List[str]],
+    source_map: Dict[str, str],
+    threshold_abs: float = 0.65,
+    margin: float = 0.15,
+    allow_multiple: bool = False,
+    sem_weight: float = 0.8,
+    lex_weight: float = 0.2,
+    extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
+    relationships: Optional[Dict[str, float]] = None,
+    use_dynamic_thresholds: bool = True,
+    use_adaptive_margin: bool = True,
+    use_semantic_clustering: bool = True
+) -> dict:
         """
         Score each allowed value against the product_text with dynamic thresholds.
         Returns dict with values in array format: [{"value": "...", "source": "..."}]
@@ -3251,26 +3263,42 @@ If an attribute is not present, do not include it in the response.
         for attr, allowed_values in mandatory_attrs.items():
             scores: List[Tuple[str, float]] = []
             
-            # Normalize product text for dimension matching if needed
-            normalized_product_text = product_text
+            # Check if this is a dimension attribute
             is_dimension_attr = any(keyword in attr.lower() for keyword in ['dimension', 'size', 'measurement'])
+            
+            # Normalize product text once for dimension matching
+            normalized_product_text = ProductAttributeService.normalize_dimension_text(product_text) if is_dimension_attr else ""
 
             for val in allowed_values:
-                # For dimension attributes, try normalized matching first
+                # For dimension attributes, try exact normalized matching first
                 if is_dimension_attr:
+                    # Normalize the allowed value from the list
                     normalized_val = ProductAttributeService.normalize_dimension_text(val)
-                    normalized_pt = ProductAttributeService.normalize_dimension_text(product_text)
                     
-                    # If we find exact normalized match, give it highest score
-                    if normalized_val and normalized_pt and normalized_val == normalized_pt:
-                        scores.append((val, 0.99))
-                        continue
-                    # Also check if dimension appears anywhere in the text
-                    elif normalized_val and normalized_val in normalized_pt:
-                        scores.append((val, 0.95))
+                    # If we have both normalized values and they match exactly, give highest score
+                    if normalized_val and normalized_product_text and normalized_val == normalized_product_text:
+                        scores.append((val, 1.0))
                         continue
+                    
+                    # Also check if the normalized value appears in the original product text
+                    # This handles cases where the format might be slightly different
+                    if normalized_val:
+                        # Extract just the numbers for flexible matching
+                        val_numbers = normalized_val.split('x')
+                        # Check if both numbers appear in the product text in close proximity
+                        text_lower = product_text.lower()
+                        if all(num in text_lower for num in val_numbers):
+                            # Calculate proximity score
+                            idx1 = text_lower.find(val_numbers[0])
+                            idx2 = text_lower.find(val_numbers[1])
+                            if idx1 != -1 and idx2 != -1:
+                                distance = abs(idx2 - idx1)
+                                # If numbers are close together (within 20 characters), high score
+                                if distance < 20:
+                                    scores.append((val, 0.95))
+                                    continue
                 
-                # Standard semantic matching
+                # Standard semantic matching for all attributes
                 contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}", f"{val} room"]
                 ctx_embs = [model_embedder.encode(c, convert_to_tensor=True) for c in contexts]
                 sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
@@ -3311,8 +3339,7 @@ If an attribute is not present, do not include it in the response.
                     best_cluster = next((c for c in clusters if best_val in c), [best_val])
 
                 for val, sc in scores[1:]:
-                    # Skip values with very low scores - they're likely not relevant
-                    # For dimension attributes, be even more strict
+                    # Skip values with very low scores
                     min_score = 0.4 if is_dimension_attr else 0.3
                     if sc < min_score:
                         continue
@@ -3356,6 +3383,7 @@ If an attribute is not present, do not include it in the response.
 
         return extracted
 
+
     @staticmethod
     def extract_attributes(
         product_text: str,