|
@@ -2807,22 +2807,34 @@ class ProductAttributeService:
|
|
|
if len(float_numbers) < 2:
|
|
if len(float_numbers) < 2:
|
|
|
return ""
|
|
return ""
|
|
|
|
|
|
|
|
- # If we have 3 dimensions, remove the smallest one (usually thickness/depth)
|
|
|
|
|
- # This handles cases like "16 x 1.5 x 20" -> keep 16 and 20, drop 1.5
|
|
|
|
|
- if len(float_numbers) >= 3:
|
|
|
|
|
- # Sort and remove the smallest
|
|
|
|
|
- float_numbers_sorted = sorted(float_numbers, reverse=True)
|
|
|
|
|
- float_numbers = float_numbers_sorted[:2]
|
|
|
|
|
-
|
|
|
|
|
- # Convert to integers for dimension matching
|
|
|
|
|
- int_numbers = [int(num) for num in float_numbers]
|
|
|
|
|
|
|
+ # If we have 3 dimensions, it's likely Width x Depth x Height
|
|
|
|
|
+ # For wall art, depth is usually small (< 5), so we keep first and last
|
|
|
|
|
+ if len(float_numbers) == 3:
|
|
|
|
|
+ # Keep first and last values (width and height), skip middle (depth)
|
|
|
|
|
+ float_numbers = [float_numbers[0], float_numbers[2]]
|
|
|
|
|
+ elif len(float_numbers) > 3:
|
|
|
|
|
+ # If more than 3 dimensions, keep the two largest
|
|
|
|
|
+ float_numbers = sorted(float_numbers)[-2:]
|
|
|
|
|
+ else:
|
|
|
|
|
+ # Just 2 dimensions, use as is
|
|
|
|
|
+ float_numbers = float_numbers[:2]
|
|
|
|
|
+
|
|
|
|
|
+ # Format numbers: use integer if whole, else one decimal
|
|
|
|
|
+ formatted_numbers = []
|
|
|
|
|
+ for num in float_numbers:
|
|
|
|
|
+ if num.is_integer():
|
|
|
|
|
+ formatted_numbers.append(str(int(num)))
|
|
|
|
|
+ else:
|
|
|
|
|
+ formatted_numbers.append(f"{num:.1f}")
|
|
|
|
|
|
|
|
# Sort to ensure consistent order (smaller x larger)
|
|
# Sort to ensure consistent order (smaller x larger)
|
|
|
- int_numbers.sort()
|
|
|
|
|
|
|
+ formatted_numbers.sort(key=lambda x: float(x))
|
|
|
|
|
|
|
|
# Return formatted dimension
|
|
# Return formatted dimension
|
|
|
- return f"{int_numbers[0]}x{int_numbers[1]}"
|
|
|
|
|
|
|
+ return f"{formatted_numbers[0]}x{formatted_numbers[1]}"
|
|
|
|
|
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
@staticmethod
|
|
@staticmethod
|
|
|
def normalize_value_for_matching(value: str, attr_name: str = "") -> str:
|
|
def normalize_value_for_matching(value: str, attr_name: str = "") -> str:
|
|
|
"""
|
|
"""
|
|
@@ -3222,20 +3234,20 @@ If an attribute is not present, do not include it in the response.
|
|
|
|
|
|
|
|
@staticmethod
|
|
@staticmethod
|
|
|
def normalize_against_product_text(
|
|
def normalize_against_product_text(
|
|
|
- product_text: str,
|
|
|
|
|
- mandatory_attrs: Dict[str, List[str]],
|
|
|
|
|
- source_map: Dict[str, str],
|
|
|
|
|
- threshold_abs: float = 0.65,
|
|
|
|
|
- margin: float = 0.15,
|
|
|
|
|
- allow_multiple: bool = False,
|
|
|
|
|
- sem_weight: float = 0.8,
|
|
|
|
|
- lex_weight: float = 0.2,
|
|
|
|
|
- extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
|
|
|
|
|
- relationships: Optional[Dict[str, float]] = None,
|
|
|
|
|
- use_dynamic_thresholds: bool = True,
|
|
|
|
|
- use_adaptive_margin: bool = True,
|
|
|
|
|
- use_semantic_clustering: bool = True
|
|
|
|
|
- ) -> dict:
|
|
|
|
|
|
|
+ product_text: str,
|
|
|
|
|
+ mandatory_attrs: Dict[str, List[str]],
|
|
|
|
|
+ source_map: Dict[str, str],
|
|
|
|
|
+ threshold_abs: float = 0.65,
|
|
|
|
|
+ margin: float = 0.15,
|
|
|
|
|
+ allow_multiple: bool = False,
|
|
|
|
|
+ sem_weight: float = 0.8,
|
|
|
|
|
+ lex_weight: float = 0.2,
|
|
|
|
|
+ extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
|
|
|
|
|
+ relationships: Optional[Dict[str, float]] = None,
|
|
|
|
|
+ use_dynamic_thresholds: bool = True,
|
|
|
|
|
+ use_adaptive_margin: bool = True,
|
|
|
|
|
+ use_semantic_clustering: bool = True
|
|
|
|
|
+) -> dict:
|
|
|
"""
|
|
"""
|
|
|
Score each allowed value against the product_text with dynamic thresholds.
|
|
Score each allowed value against the product_text with dynamic thresholds.
|
|
|
Returns dict with values in array format: [{"value": "...", "source": "..."}]
|
|
Returns dict with values in array format: [{"value": "...", "source": "..."}]
|
|
@@ -3251,26 +3263,42 @@ If an attribute is not present, do not include it in the response.
|
|
|
for attr, allowed_values in mandatory_attrs.items():
|
|
for attr, allowed_values in mandatory_attrs.items():
|
|
|
scores: List[Tuple[str, float]] = []
|
|
scores: List[Tuple[str, float]] = []
|
|
|
|
|
|
|
|
- # Normalize product text for dimension matching if needed
|
|
|
|
|
- normalized_product_text = product_text
|
|
|
|
|
|
|
+ # Check if this is a dimension attribute
|
|
|
is_dimension_attr = any(keyword in attr.lower() for keyword in ['dimension', 'size', 'measurement'])
|
|
is_dimension_attr = any(keyword in attr.lower() for keyword in ['dimension', 'size', 'measurement'])
|
|
|
|
|
+
|
|
|
|
|
+ # Normalize product text once for dimension matching
|
|
|
|
|
+ normalized_product_text = ProductAttributeService.normalize_dimension_text(product_text) if is_dimension_attr else ""
|
|
|
|
|
|
|
|
for val in allowed_values:
|
|
for val in allowed_values:
|
|
|
- # For dimension attributes, try normalized matching first
|
|
|
|
|
|
|
+ # For dimension attributes, try exact normalized matching first
|
|
|
if is_dimension_attr:
|
|
if is_dimension_attr:
|
|
|
|
|
+ # Normalize the allowed value from the list
|
|
|
normalized_val = ProductAttributeService.normalize_dimension_text(val)
|
|
normalized_val = ProductAttributeService.normalize_dimension_text(val)
|
|
|
- normalized_pt = ProductAttributeService.normalize_dimension_text(product_text)
|
|
|
|
|
|
|
|
|
|
- # If we find exact normalized match, give it highest score
|
|
|
|
|
- if normalized_val and normalized_pt and normalized_val == normalized_pt:
|
|
|
|
|
- scores.append((val, 0.99))
|
|
|
|
|
- continue
|
|
|
|
|
- # Also check if dimension appears anywhere in the text
|
|
|
|
|
- elif normalized_val and normalized_val in normalized_pt:
|
|
|
|
|
- scores.append((val, 0.95))
|
|
|
|
|
|
|
+ # If we have both normalized values and they match exactly, give highest score
|
|
|
|
|
+ if normalized_val and normalized_product_text and normalized_val == normalized_product_text:
|
|
|
|
|
+ scores.append((val, 1.0))
|
|
|
continue
|
|
continue
|
|
|
|
|
+
|
|
|
|
|
+ # Also check if the normalized value appears in the original product text
|
|
|
|
|
+ # This handles cases where the format might be slightly different
|
|
|
|
|
+ if normalized_val:
|
|
|
|
|
+ # Extract just the numbers for flexible matching
|
|
|
|
|
+ val_numbers = normalized_val.split('x')
|
|
|
|
|
+ # Check if both numbers appear in the product text in close proximity
|
|
|
|
|
+ text_lower = product_text.lower()
|
|
|
|
|
+ if all(num in text_lower for num in val_numbers):
|
|
|
|
|
+ # Calculate proximity score
|
|
|
|
|
+ idx1 = text_lower.find(val_numbers[0])
|
|
|
|
|
+ idx2 = text_lower.find(val_numbers[1])
|
|
|
|
|
+ if idx1 != -1 and idx2 != -1:
|
|
|
|
|
+ distance = abs(idx2 - idx1)
|
|
|
|
|
+ # If numbers are close together (within 20 characters), high score
|
|
|
|
|
+ if distance < 20:
|
|
|
|
|
+ scores.append((val, 0.95))
|
|
|
|
|
+ continue
|
|
|
|
|
|
|
|
- # Standard semantic matching
|
|
|
|
|
|
|
+ # Standard semantic matching for all attributes
|
|
|
contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}", f"{val} room"]
|
|
contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}", f"{val} room"]
|
|
|
ctx_embs = [model_embedder.encode(c, convert_to_tensor=True) for c in contexts]
|
|
ctx_embs = [model_embedder.encode(c, convert_to_tensor=True) for c in contexts]
|
|
|
sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
|
|
sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
|
|
@@ -3311,8 +3339,7 @@ If an attribute is not present, do not include it in the response.
|
|
|
best_cluster = next((c for c in clusters if best_val in c), [best_val])
|
|
best_cluster = next((c for c in clusters if best_val in c), [best_val])
|
|
|
|
|
|
|
|
for val, sc in scores[1:]:
|
|
for val, sc in scores[1:]:
|
|
|
- # Skip values with very low scores - they're likely not relevant
|
|
|
|
|
- # For dimension attributes, be even more strict
|
|
|
|
|
|
|
+ # Skip values with very low scores
|
|
|
min_score = 0.4 if is_dimension_attr else 0.3
|
|
min_score = 0.4 if is_dimension_attr else 0.3
|
|
|
if sc < min_score:
|
|
if sc < min_score:
|
|
|
continue
|
|
continue
|
|
@@ -3356,6 +3383,7 @@ If an attribute is not present, do not include it in the response.
|
|
|
|
|
|
|
|
return extracted
|
|
return extracted
|
|
|
|
|
|
|
|
|
|
+
|
|
|
@staticmethod
|
|
@staticmethod
|
|
|
def extract_attributes(
|
|
def extract_attributes(
|
|
|
product_text: str,
|
|
product_text: str,
|