пре 3 месеци · b350ab3eb2
--- a/attr_extraction/services.py
+++ b/attr_extraction/services.py
@@ -2807,22 +2807,34 @@ class ProductAttributeService:
 
				         if len(float_numbers) < 2:
			
 
				             return ""
			
 
				         
			
 
				-        # If we have 3 dimensions, remove the smallest one (usually thickness/depth)
			
 
				-        # This handles cases like "16 x 1.5 x 20" -> keep 16 and 20, drop 1.5
			
 
				-        if len(float_numbers) >= 3:
			
 
				-            # Sort and remove the smallest
			
 
				-            float_numbers_sorted = sorted(float_numbers, reverse=True)
			
 
				-            float_numbers = float_numbers_sorted[:2]
			
 
				-        
			
 
				-        # Convert to integers for dimension matching
			
 
				-        int_numbers = [int(num) for num in float_numbers]
			
 
				+        # If we have 3 dimensions, it's likely Width x Depth x Height
			
 
				+        # For wall art, depth is usually small (< 5), so we keep first and last
			
 
				+        if len(float_numbers) == 3:
			
 
				+            # Keep first and last values (width and height), skip middle (depth)
			
 
				+            float_numbers = [float_numbers[0], float_numbers[2]]
			
 
				+        elif len(float_numbers) > 3:
			
 
				+            # If more than 3 dimensions, keep the two largest
			
 
				+            float_numbers = sorted(float_numbers)[-2:]
			
 
				+        else:
			
 
				+            # Just 2 dimensions, use as is
			
 
				+            float_numbers = float_numbers[:2]
			
 
				+        
			
 
				+        # Format numbers: use integer if whole, else one decimal
			
 
				+        formatted_numbers = []
			
 
				+        for num in float_numbers:
			
 
				+            if num.is_integer():
			
 
				+                formatted_numbers.append(str(int(num)))
			
 
				+            else:
			
 
				+                formatted_numbers.append(f"{num:.1f}")
			
 
				         
			
 
				         # Sort to ensure consistent order (smaller x larger)
			
 
				-        int_numbers.sort()
			
 
				+        formatted_numbers.sort(key=lambda x: float(x))
			
 
				         
			
 
				         # Return formatted dimension
			
 
				-        return f"{int_numbers[0]}x{int_numbers[1]}"
			
 
				+        return f"{formatted_numbers[0]}x{formatted_numbers[1]}"
			
 
				 
			
 
				+    
			
 
				+    
			
 
				     @staticmethod
			
 
				     def normalize_value_for_matching(value: str, attr_name: str = "") -> str:
			
 
				         """
			
@@ -3222,20 +3234,20 @@ If an attribute is not present, do not include it in the response.
 
				 
			
 
				     @staticmethod
			
 
				     def normalize_against_product_text(
			
 
				-        product_text: str,
			
 
				-        mandatory_attrs: Dict[str, List[str]],
			
 
				-        source_map: Dict[str, str],
			
 
				-        threshold_abs: float = 0.65,
			
 
				-        margin: float = 0.15,
			
 
				-        allow_multiple: bool = False,
			
 
				-        sem_weight: float = 0.8,
			
 
				-        lex_weight: float = 0.2,
			
 
				-        extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
			
 
				-        relationships: Optional[Dict[str, float]] = None,
			
 
				-        use_dynamic_thresholds: bool = True,
			
 
				-        use_adaptive_margin: bool = True,
			
 
				-        use_semantic_clustering: bool = True
			
 
				-    ) -> dict:
			
 
				+    product_text: str,
			
 
				+    mandatory_attrs: Dict[str, List[str]],
			
 
				+    source_map: Dict[str, str],
			
 
				+    threshold_abs: float = 0.65,
			
 
				+    margin: float = 0.15,
			
 
				+    allow_multiple: bool = False,
			
 
				+    sem_weight: float = 0.8,
			
 
				+    lex_weight: float = 0.2,
			
 
				+    extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
			
 
				+    relationships: Optional[Dict[str, float]] = None,
			
 
				+    use_dynamic_thresholds: bool = True,
			
 
				+    use_adaptive_margin: bool = True,
			
 
				+    use_semantic_clustering: bool = True
			
 
				+) -> dict:
			
 
				         """
			
 
				         Score each allowed value against the product_text with dynamic thresholds.
			
 
				         Returns dict with values in array format: [{"value": "...", "source": "..."}]
			
@@ -3251,26 +3263,42 @@ If an attribute is not present, do not include it in the response.
 
				         for attr, allowed_values in mandatory_attrs.items():
			
 
				             scores: List[Tuple[str, float]] = []
			
 
				             
			
 
				-            # Normalize product text for dimension matching if needed
			
 
				-            normalized_product_text = product_text
			
 
				+            # Check if this is a dimension attribute
			
 
				             is_dimension_attr = any(keyword in attr.lower() for keyword in ['dimension', 'size', 'measurement'])
			
 
				+            
			
 
				+            # Normalize product text once for dimension matching
			
 
				+            normalized_product_text = ProductAttributeService.normalize_dimension_text(product_text) if is_dimension_attr else ""
			
 
				 
			
 
				             for val in allowed_values:
			
 
				-                # For dimension attributes, try normalized matching first
			
 
				+                # For dimension attributes, try exact normalized matching first
			
 
				                 if is_dimension_attr:
			
 
				+                    # Normalize the allowed value from the list
			
 
				                     normalized_val = ProductAttributeService.normalize_dimension_text(val)
			
 
				-                    normalized_pt = ProductAttributeService.normalize_dimension_text(product_text)
			
 
				                     
			
 
				-                    # If we find exact normalized match, give it highest score
			
 
				-                    if normalized_val and normalized_pt and normalized_val == normalized_pt:
			
 
				-                        scores.append((val, 0.99))
			
 
				-                        continue
			
 
				-                    # Also check if dimension appears anywhere in the text
			
 
				-                    elif normalized_val and normalized_val in normalized_pt:
			
 
				-                        scores.append((val, 0.95))
			
 
				+                    # If we have both normalized values and they match exactly, give highest score
			
 
				+                    if normalized_val and normalized_product_text and normalized_val == normalized_product_text:
			
 
				+                        scores.append((val, 1.0))
			
 
				                         continue
			
 
				+                    
			
 
				+                    # Also check if the normalized value appears in the original product text
			
 
				+                    # This handles cases where the format might be slightly different
			
 
				+                    if normalized_val:
			
 
				+                        # Extract just the numbers for flexible matching
			
 
				+                        val_numbers = normalized_val.split('x')
			
 
				+                        # Check if both numbers appear in the product text in close proximity
			
 
				+                        text_lower = product_text.lower()
			
 
				+                        if all(num in text_lower for num in val_numbers):
			
 
				+                            # Calculate proximity score
			
 
				+                            idx1 = text_lower.find(val_numbers[0])
			
 
				+                            idx2 = text_lower.find(val_numbers[1])
			
 
				+                            if idx1 != -1 and idx2 != -1:
			
 
				+                                distance = abs(idx2 - idx1)
			
 
				+                                # If numbers are close together (within 20 characters), high score
			
 
				+                                if distance < 20:
			
 
				+                                    scores.append((val, 0.95))
			
 
				+                                    continue
			
 
				                 
			
 
				-                # Standard semantic matching
			
 
				+                # Standard semantic matching for all attributes
			
 
				                 contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}", f"{val} room"]
			
 
				                 ctx_embs = [model_embedder.encode(c, convert_to_tensor=True) for c in contexts]
			
 
				                 sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
			
@@ -3311,8 +3339,7 @@ If an attribute is not present, do not include it in the response.
 
				                     best_cluster = next((c for c in clusters if best_val in c), [best_val])
			
 
				 
			
 
				                 for val, sc in scores[1:]:
			
 
				-                    # Skip values with very low scores - they're likely not relevant
			
 
				-                    # For dimension attributes, be even more strict
			
 
				+                    # Skip values with very low scores
			
 
				                     min_score = 0.4 if is_dimension_attr else 0.3
			
 
				                     if sc < min_score:
			
 
				                         continue
			
@@ -3356,6 +3383,7 @@ If an attribute is not present, do not include it in the response.
 
				 
			
 
				         return extracted
			
 
				 
			
 
				+
			
 
				     @staticmethod
			
 
				     def extract_attributes(
			
 
				         product_text: str,
			
--- a/db.sqlite3
+++ b/db.sqlite3