hace 3 meses · b350ab3eb2
--- a/attr_extraction/services.py
+++ b/attr_extraction/services.py
@@ -2807,22 +2807,34 @@ class ProductAttributeService:
 
															         if len(float_numbers) < 2:
														
 
															             return ""
														
 
															-        # If we have 3 dimensions, remove the smallest one (usually thickness/depth)
														
 
															-        # This handles cases like "16 x 1.5 x 20" -> keep 16 and 20, drop 1.5
														
 
															-        if len(float_numbers) >= 3:
														
 
															-            # Sort and remove the smallest
														
 
															-            float_numbers_sorted = sorted(float_numbers, reverse=True)
														
 
															-            float_numbers = float_numbers_sorted[:2]
														
 
															-        
														
 
															-        # Convert to integers for dimension matching
														
 
															-        int_numbers = [int(num) for num in float_numbers]
														
 
															+        # If we have 3 dimensions, it's likely Width x Depth x Height
														
 
															+        # For wall art, depth is usually small (< 5), so we keep first and last
														
 
															+        if len(float_numbers) == 3:
														
 
															+            # Keep first and last values (width and height), skip middle (depth)
														
 
															+            float_numbers = [float_numbers[0], float_numbers[2]]
														
 
															+        elif len(float_numbers) > 3:
														
 
															+            # If more than 3 dimensions, keep the two largest
														
 
															+            float_numbers = sorted(float_numbers)[-2:]
														
 
															+        else:
														
 
															+            # Just 2 dimensions, use as is
														
 
															+            float_numbers = float_numbers[:2]
														
 
															+        
														
 
															+        # Format numbers: use integer if whole, else one decimal
														
 
															+        formatted_numbers = []
														
 
															+        for num in float_numbers:
														
 
															+            if num.is_integer():
														
 
															+                formatted_numbers.append(str(int(num)))
														
 
															+            else:
														
 
															+                formatted_numbers.append(f"{num:.1f}")
														
 
															         # Sort to ensure consistent order (smaller x larger)
														
 
															-        int_numbers.sort()
														
 
															+        formatted_numbers.sort(key=lambda x: float(x))
														
 
															         # Return formatted dimension
														
 
															-        return f"{int_numbers[0]}x{int_numbers[1]}"
														
 
															+        return f"{formatted_numbers[0]}x{formatted_numbers[1]}"
														
 
															+    
														
 
															+    
														
 
															     @staticmethod
														
 
															     def normalize_value_for_matching(value: str, attr_name: str = "") -> str:
														
 
															         """
														
@@ -3222,20 +3234,20 @@ If an attribute is not present, do not include it in the response.
 
															     @staticmethod
														
 
															     def normalize_against_product_text(
														
 
															-        product_text: str,
														
 
															-        mandatory_attrs: Dict[str, List[str]],
														
 
															-        source_map: Dict[str, str],
														
 
															-        threshold_abs: float = 0.65,
														
 
															-        margin: float = 0.15,
														
 
															-        allow_multiple: bool = False,
														
 
															-        sem_weight: float = 0.8,
														
 
															-        lex_weight: float = 0.2,
														
 
															-        extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
														
 
															-        relationships: Optional[Dict[str, float]] = None,
														
 
															-        use_dynamic_thresholds: bool = True,
														
 
															-        use_adaptive_margin: bool = True,
														
 
															-        use_semantic_clustering: bool = True
														
 
															-    ) -> dict:
														
 
															+    product_text: str,
														
 
															+    mandatory_attrs: Dict[str, List[str]],
														
 
															+    source_map: Dict[str, str],
														
 
															+    threshold_abs: float = 0.65,
														
 
															+    margin: float = 0.15,
														
 
															+    allow_multiple: bool = False,
														
 
															+    sem_weight: float = 0.8,
														
 
															+    lex_weight: float = 0.2,
														
 
															+    extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
														
 
															+    relationships: Optional[Dict[str, float]] = None,
														
 
															+    use_dynamic_thresholds: bool = True,
														
 
															+    use_adaptive_margin: bool = True,
														
 
															+    use_semantic_clustering: bool = True
														
 
															+) -> dict:
														
 
															         """
														
 
															         Score each allowed value against the product_text with dynamic thresholds.
														
 
															         Returns dict with values in array format: [{"value": "...", "source": "..."}]
														
@@ -3251,26 +3263,42 @@ If an attribute is not present, do not include it in the response.
 
															         for attr, allowed_values in mandatory_attrs.items():
														
 
															             scores: List[Tuple[str, float]] = []
														
 
															-            # Normalize product text for dimension matching if needed
														
 
															-            normalized_product_text = product_text
														
 
															+            # Check if this is a dimension attribute
														
 
															             is_dimension_attr = any(keyword in attr.lower() for keyword in ['dimension', 'size', 'measurement'])
														
 
															+            
														
 
															+            # Normalize product text once for dimension matching
														
 
															+            normalized_product_text = ProductAttributeService.normalize_dimension_text(product_text) if is_dimension_attr else ""
														
 
															             for val in allowed_values:
														
 
															-                # For dimension attributes, try normalized matching first
														
 
															+                # For dimension attributes, try exact normalized matching first
														
 
															                 if is_dimension_attr:
														
 
															+                    # Normalize the allowed value from the list
														
 
															                     normalized_val = ProductAttributeService.normalize_dimension_text(val)
														
 
															-                    normalized_pt = ProductAttributeService.normalize_dimension_text(product_text)
														
 
															-                    # If we find exact normalized match, give it highest score
														
 
															-                    if normalized_val and normalized_pt and normalized_val == normalized_pt:
														
 
															-                        scores.append((val, 0.99))
														
 
															-                        continue
														
 
															-                    # Also check if dimension appears anywhere in the text
														
 
															-                    elif normalized_val and normalized_val in normalized_pt:
														
 
															-                        scores.append((val, 0.95))
														
 
															+                    # If we have both normalized values and they match exactly, give highest score
														
 
															+                    if normalized_val and normalized_product_text and normalized_val == normalized_product_text:
														
 
															+                        scores.append((val, 1.0))
														
 
															                         continue
														
 
															+                    
														
 
															+                    # Also check if the normalized value appears in the original product text
														
 
															+                    # This handles cases where the format might be slightly different
														
 
															+                    if normalized_val:
														
 
															+                        # Extract just the numbers for flexible matching
														
 
															+                        val_numbers = normalized_val.split('x')
														
 
															+                        # Check if both numbers appear in the product text in close proximity
														
 
															+                        text_lower = product_text.lower()
														
 
															+                        if all(num in text_lower for num in val_numbers):
														
 
															+                            # Calculate proximity score
														
 
															+                            idx1 = text_lower.find(val_numbers[0])
														
 
															+                            idx2 = text_lower.find(val_numbers[1])
														
 
															+                            if idx1 != -1 and idx2 != -1:
														
 
															+                                distance = abs(idx2 - idx1)
														
 
															+                                # If numbers are close together (within 20 characters), high score
														
 
															+                                if distance < 20:
														
 
															+                                    scores.append((val, 0.95))
														
 
															+                                    continue
														
 
															-                # Standard semantic matching
														
 
															+                # Standard semantic matching for all attributes
														
 
															                 contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}", f"{val} room"]
														
 
															                 ctx_embs = [model_embedder.encode(c, convert_to_tensor=True) for c in contexts]
														
 
															                 sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
														
@@ -3311,8 +3339,7 @@ If an attribute is not present, do not include it in the response.
 
															                     best_cluster = next((c for c in clusters if best_val in c), [best_val])
														
 
															                 for val, sc in scores[1:]:
														
 
															-                    # Skip values with very low scores - they're likely not relevant
														
 
															-                    # For dimension attributes, be even more strict
														
 
															+                    # Skip values with very low scores
														
 
															                     min_score = 0.4 if is_dimension_attr else 0.3
														
 
															                     if sc < min_score:
														
 
															                         continue
														
@@ -3356,6 +3383,7 @@ If an attribute is not present, do not include it in the response.
 
															         return extracted
														
 
															+
														
 
															     @staticmethod
														
 
															     def extract_attributes(
														
 
															         product_text: str,
														
--- a/db.sqlite3
+++ b/db.sqlite3