harshit.pathak
/
content_quality_tool


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007
01702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882
							# ==================== services.py ====================
import requests
import json
from typing import Dict, List, Optional, Tuple
from django.conf import settings
from concurrent.futures import ThreadPoolExecutor, as_completed
from sentence_transformers import SentenceTransformer, util
import numpy as np
from .ocr_service import OCRService


# Shared sentence-embedding model ("all-MiniLM-L6-v2"). It is instantiated once,
# at import time, and reused by the scoring/clustering helpers below.
model_embedder = SentenceTransformer("all-MiniLM-L6-v2")


class ProductAttributeService:
    """Service class for extracting product attributes using Groq LLM."""

    @staticmethod
    def combine_product_text(
        title: Optional[str] = None,
        short_desc: Optional[str] = None,
        long_desc: Optional[str] = None,
        ocr_text: Optional[str] = None
    ) -> Tuple[str, Dict[str, str]]:
        """
        Combine product metadata into a single text block.
        Returns: (combined_text, source_map) where source_map tracks which text came from where
        """
        parts = []
        source_map = {}
        
        if title:
            title_str = str(title).strip()
            parts.append(f"Title: {title_str}")
            source_map['title'] = title_str
        if short_desc:
            short_str = str(short_desc).strip()
            parts.append(f"Description: {short_str}")
            source_map['short_desc'] = short_str
        if long_desc:
            long_str = str(long_desc).strip()
            parts.append(f"Details: {long_str}")
            source_map['long_desc'] = long_str
        if ocr_text:
            parts.append(f"OCR Text: {ocr_text}")
            source_map['ocr_text'] = ocr_text
        
        combined = "\n".join(parts).strip()
        
        if not combined:
            return "No product information available", {}
        
        return combined, source_map

    @staticmethod
    def find_value_source(value: str, source_map: Dict[str, str]) -> str:
        """
        Find which source(s) contain the given value.
        Returns the source name(s) where the value appears.
        """
        value_lower = value.lower()
        # Split value into tokens for better matching
        value_tokens = set(value_lower.replace("-", " ").split())
        
        sources_found = []
        source_scores = {}
        
        for source_name, source_text in source_map.items():
            source_lower = source_text.lower()
            
            # Check for exact phrase match first
            if value_lower in source_lower:
                source_scores[source_name] = 1.0
                continue
            
            # Check for token matches
            token_matches = sum(1 for token in value_tokens if token in source_lower)
            if token_matches > 0:
                source_scores[source_name] = token_matches / len(value_tokens)
        
        # Return source with highest score, or all sources if multiple have same score
        if source_scores:
            max_score = max(source_scores.values())
            sources_found = [s for s, score in source_scores.items() if score == max_score]
            
            # Prioritize: title > short_desc > long_desc > ocr_text
            priority = ['title', 'short_desc', 'long_desc', 'ocr_text']
            for p in priority:
                if p in sources_found:
                    return p
            
            return sources_found[0] if sources_found else "Not found"
        
        return "Not found"

    @staticmethod
    def extract_attributes_from_ocr(ocr_results: Dict, model: str = None) -> Dict:
        """Extract structured attributes from OCR text using LLM."""
        if model is None:
            model = settings.SUPPORTED_MODELS[0]
        
        detected_text = ocr_results.get('detected_text', [])
        if not detected_text:
            return {}
        
        # Format OCR text for prompt
        ocr_text = "\n".join([f"Text: {item['text']}, Confidence: {item['confidence']:.2f}" 
                              for item in detected_text])
        
        prompt = f"""
You are an AI model that extracts structured attributes from OCR text detected on product images.
Given the OCR detections below, infer the possible product attributes and return them as a clean JSON object.

OCR Text:
{ocr_text}

Extract relevant attributes like:
- brand
- model_number
- size (waist_size, length, etc.)
- collection
- any other relevant product information

Return a JSON object with only the attributes you can confidently identify.
If an attribute is not present, do not include it in the response.
"""
        
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful AI that extracts structured data from OCR output. Return only valid JSON."
                },
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.2,
            "max_tokens": 500
        }
        
        headers = {
            "Authorization": f"Bearer {settings.GROQ_API_KEY}",
            "Content-Type": "application/json",
        }
        
        try:
            response = requests.post(
                settings.GROQ_API_URL,
                headers=headers,
                json=payload,
                timeout=30
            )
            response.raise_for_status()
            result_text = response.json()["choices"][0]["message"]["content"].strip()
            
            # Clean and parse JSON
            result_text = ProductAttributeService._clean_json_response(result_text)
            parsed = json.loads(result_text)
            
            return parsed
        except Exception as e:
            return {"error": f"Failed to extract attributes from OCR: {str(e)}"}

    @staticmethod
    def calculate_attribute_relationships(
        mandatory_attrs: Dict[str, List[str]],
        product_text: str
    ) -> Dict[str, float]:
        """
        Calculate semantic relationships between attribute values across different attributes.
        Returns a matrix of cross-attribute value similarities.
        """
        pt_emb = model_embedder.encode(product_text, convert_to_tensor=True)

        # Calculate similarities between all attribute values and product text
        attr_scores = {}
        for attr, values in mandatory_attrs.items():
            attr_scores[attr] = {}
            for val in values:
                contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}"]
                ctx_embs = [model_embedder.encode(c, convert_to_tensor=True) for c in contexts]
                sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
                attr_scores[attr][val] = sem_sim

        # Calculate cross-attribute value relationships
        relationships = {}
        attr_list = list(mandatory_attrs.keys())

        for i, attr1 in enumerate(attr_list):
            for attr2 in attr_list[i+1:]:
                # Calculate pairwise similarities between values of different attributes
                for val1 in mandatory_attrs[attr1]:
                    for val2 in mandatory_attrs[attr2]:
                        emb1 = model_embedder.encode(val1, convert_to_tensor=True)
                        emb2 = model_embedder.encode(val2, convert_to_tensor=True)
                        sim = float(util.cos_sim(emb1, emb2).item())

                        # Store bidirectional relationships
                        key1 = f"{attr1}:{val1}->{attr2}:{val2}"
                        key2 = f"{attr2}:{val2}->{attr1}:{val1}"
                        relationships[key1] = sim
                        relationships[key2] = sim

        return relationships

    @staticmethod
    def calculate_value_clusters(
        values: List[str],
        scores: List[Tuple[str, float]],
        cluster_threshold: float = 0.4
    ) -> List[List[str]]:
        """
        Group values into semantic clusters based on their similarity to each other.
        Returns clusters of related values.
        """
        if len(values) <= 1:
            return [[val] for val, _ in scores]

        # Get embeddings for all values
        embeddings = [model_embedder.encode(val, convert_to_tensor=True) for val in values]

        # Calculate pairwise similarities
        similarity_matrix = np.zeros((len(values), len(values)))
        for i in range(len(values)):
            for j in range(i+1, len(values)):
                sim = float(util.cos_sim(embeddings[i], embeddings[j]).item())
                similarity_matrix[i][j] = sim
                similarity_matrix[j][i] = sim

        # Simple clustering: group values with high similarity
        clusters = []
        visited = set()

        for i, (val, score) in enumerate(scores):
            if i in visited:
                continue

            cluster = [val]
            visited.add(i)

            # Find similar values
            for j in range(len(values)):
                if j not in visited and similarity_matrix[i][j] >= cluster_threshold:
                    cluster.append(values[j])
                    visited.add(j)

            clusters.append(cluster)

        return clusters

    @staticmethod
    def get_dynamic_threshold(
        attr: str,
        val: str,
        base_score: float,
        extracted_attrs: Dict[str, List[Dict[str, str]]],
        relationships: Dict[str, float],
        mandatory_attrs: Dict[str, List[str]],
        base_threshold: float = 0.65,
        boost_factor: float = 0.15
    ) -> float:
        """
        Calculate dynamic threshold based on relationships with already-extracted attributes.
        """
        threshold = base_threshold

        # Check relationships with already extracted attributes
        max_relationship = 0.0
        for other_attr, other_values_list in extracted_attrs.items():
            if other_attr == attr:
                continue

            for other_val_dict in other_values_list:
                other_val = other_val_dict['value']
                key = f"{attr}:{val}->{other_attr}:{other_val}"
                if key in relationships:
                    max_relationship = max(max_relationship, relationships[key])

        # If strong relationship exists, lower threshold
        if max_relationship > 0.6:
            threshold = base_threshold - (boost_factor * max_relationship)

        return max(0.3, threshold)

    @staticmethod
    def get_adaptive_margin(
        scores: List[Tuple[str, float]],
        base_margin: float = 0.15,
        max_margin: float = 0.22
    ) -> float:
        """
        Calculate adaptive margin based on score distribution.
        """
        if len(scores) < 2:
            return base_margin

        score_values = [s for _, s in scores]
        best_score = score_values[0]

        # If best score is very low, use adaptive margin but be more conservative
        if best_score < 0.5:
            # Calculate score spread in top 3-4 scores only (more selective)
            top_scores = score_values[:min(4, len(score_values))]
            score_range = max(top_scores) - min(top_scores)

            # Very controlled margin increase
            if score_range < 0.30:
                # Much more conservative scaling
                score_factor = (0.5 - best_score) * 0.35
                adaptive = base_margin + score_factor + (0.30 - score_range) * 0.2
                return min(adaptive, max_margin)

        return base_margin

    @staticmethod
    def _lexical_evidence(product_text: str, label: str) -> float:
        """Calculate lexical overlap between product text and label."""
        pt = product_text.lower()
        tokens = [t for t in label.lower().replace("-", " ").split() if t]
        if not tokens:
            return 0.0
        hits = sum(1 for t in tokens if t in pt)
        return hits / len(tokens)

    @staticmethod
    def normalize_against_product_text(
        product_text: str,
        mandatory_attrs: Dict[str, List[str]],
        source_map: Dict[str, str],
        threshold_abs: float = 0.65,
        margin: float = 0.15,
        allow_multiple: bool = False,
        sem_weight: float = 0.8,
        lex_weight: float = 0.2,
        extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
        relationships: Optional[Dict[str, float]] = None,
        use_dynamic_thresholds: bool = True,
        use_adaptive_margin: bool = True,
        use_semantic_clustering: bool = True
    ) -> dict:
        """
        Score each allowed value against the product text and pick the winner(s).

        Every allowed value is scored as
        ``sem_weight * best_context_similarity + lex_weight * lexical_overlap``.
        In single-value mode the top-scoring value is always returned. In
        multi-value mode the top value is returned together with any runner-up
        that passes the margin / threshold / cluster / score-ratio rules below.
        Progress is written to stdout with ``print`` for debugging.

        Args:
            product_text: Combined product text to match against.
            mandatory_attrs: Attribute name -> allowed values.
            source_map: Source name -> text, used to attribute each chosen value.
            threshold_abs: Base absolute score threshold.
            margin: Base allowed gap between the best score and a runner-up.
            allow_multiple: Whether more than one value may be returned.
            sem_weight: Weight of the embedding similarity in the final score.
            lex_weight: Weight of the lexical overlap in the final score.
            extracted_attrs: Values already chosen for other attributes; used
                for dynamic thresholds.
            relationships: Cross-attribute value similarities keyed as
                "attr:val->other_attr:other_val".
            use_dynamic_thresholds: Relax the threshold for related values.
            use_adaptive_margin: Widen the margin for low, tightly bunched scores.
            use_semantic_clustering: Consider cluster membership with the best value.

        Returns:
            Dict mapping each attribute to a list of
            ``{"value": ..., "source": ...}`` entries. An attribute with no
            allowed values maps to an empty list.
        """
        if extracted_attrs is None:
            extracted_attrs = {}
        if relationships is None:
            relationships = {}

        pt_emb = model_embedder.encode(product_text, convert_to_tensor=True)
        extracted = {}

        for attr, allowed_values in mandatory_attrs.items():
            if not allowed_values:
                # Nothing to choose from: report no value rather than failing
                # on scores[0] below.
                extracted[attr] = []
                continue

            scores: List[Tuple[str, float]] = []

            for val in allowed_values:
                contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}", f"{val} room"]
                ctx_embs = [model_embedder.encode(c, convert_to_tensor=True) for c in contexts]
                sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)

                lex_score = ProductAttributeService._lexical_evidence(product_text, val)
                final_score = sem_weight * sem_sim + lex_weight * lex_score
                scores.append((val, final_score))

            scores.sort(key=lambda x: x[1], reverse=True)
            best_val, best_score = scores[0]

            # DEBUG: Print scores
            print(f"\n{'='*80}")
            print(f"Attribute: {attr}")
            print(f"{'='*80}")
            print(f"Top 5 Scores:")
            for i, (val, sc) in enumerate(scores[:5]):
                print(f"  {i+1}. {val}: {sc:.4f}")
            print(f"\nBest: {best_val} (score: {best_score:.4f})")
            print(f"Base Threshold: {threshold_abs}")
            print(f"Base Margin: {margin}")

            # Calculate adaptive margin if enabled
            effective_margin = margin
            if allow_multiple and use_adaptive_margin:
                effective_margin = ProductAttributeService.get_adaptive_margin(scores, margin)
                print(f"Adaptive Margin: {effective_margin}")

            if not allow_multiple:
                source = ProductAttributeService.find_value_source(best_val, source_map)
                extracted[attr] = [{"value": best_val, "source": source}]
                print(f"Single value mode - Selected: {best_val}")
            else:
                print(f"\nMultiple value mode enabled")
                candidates = [best_val]
                use_base_threshold = best_score >= threshold_abs
                print(f"Use base threshold: {use_base_threshold} (best_score >= {threshold_abs})")

                # Get semantic clusters if enabled
                clusters = []
                if use_semantic_clustering:
                    clusters = ProductAttributeService.calculate_value_clusters(
                        allowed_values, scores, cluster_threshold=0.4
                    )
                    print(f"\nSemantic Clusters:")
                    for idx, cluster in enumerate(clusters):
                        marker = " <- BEST" if best_val in cluster else ""
                        print(f"  Cluster {idx+1}: {cluster}{marker}")

                print(f"\nEvaluating additional candidates:")
                for val, sc in scores[1:]:
                    # Calculate dynamic threshold for this value
                    if use_dynamic_thresholds and extracted_attrs:
                        dynamic_thresh = ProductAttributeService.get_dynamic_threshold(
                            attr, val, sc, extracted_attrs, relationships,
                            mandatory_attrs, threshold_abs
                        )
                    else:
                        dynamic_thresh = threshold_abs

                    within_margin = (best_score - sc) <= effective_margin
                    above_threshold = sc >= dynamic_thresh

                    # Check if in same semantic cluster as best value
                    in_cluster = False
                    if use_semantic_clustering and clusters:
                        in_cluster = any(best_val in c and val in c for c in clusters)

                    # DEBUG: Print candidate evaluation
                    print(f"\n  Candidate: {val}")
                    print(f"    Score: {sc:.4f}")
                    print(f"    Margin diff: {best_score - sc:.4f} (within_margin: {within_margin})")
                    print(f"    Dynamic threshold: {dynamic_thresh:.4f} (above_threshold: {above_threshold})")
                    print(f"    In cluster with best: {in_cluster}")

                    # Balanced multi-value selection: stricter when the best
                    # score is weak, more permissive when it is strong.
                    include_candidate = False
                    reason = ""

                    # Calculate score ratio (how close to best score)
                    score_ratio = sc / best_score if best_score > 0 else 0

                    if use_base_threshold:
                        # Best score is good (>= threshold), be selective
                        if above_threshold and within_margin:
                            include_candidate = True
                            reason = "above threshold AND within margin"
                        elif in_cluster and within_margin and score_ratio >= 0.75:
                            # Only include cluster members if they're close in score
                            include_candidate = True
                            reason = "in cluster AND within margin with good score ratio"
                    else:
                        # Best score is low (< threshold), be more careful
                        # Only include candidates that are very close to the best score
                        if within_margin and score_ratio >= 0.80:
                            # Must be at least 80% of best score
                            include_candidate = True
                            reason = "within margin with strong score ratio"
                        elif in_cluster and within_margin and score_ratio >= 0.85:
                            # Cluster members need even higher ratio when best score is low
                            include_candidate = True
                            reason = "in cluster with tight margin and high score ratio"

                    # Additional filter: a runner-up "Not Specified" is dropped
                    # when other candidates exist or it trails the best score
                    # by more than 5%.
                    if include_candidate and val.lower() in ["not specified", "not_specified", "unspecified"]:
                        if len(candidates) > 1 or (sc < best_score * 0.95):
                            include_candidate = False
                            reason = "excluded: 'Not Specified' with better alternatives"

                    if include_candidate:
                        candidates.append(val)
                        print(f"    ✓ INCLUDED - Reason: {reason}")
                    else:
                        print(f"    ✗ EXCLUDED")

                # Map each candidate to its source and create array format
                extracted[attr] = []
                print(f"\nFinal candidates for {attr}: {candidates}")
                for candidate in candidates:
                    source = ProductAttributeService.find_value_source(candidate, source_map)
                    extracted[attr].append({"value": candidate, "source": source})
                    print(f"  - {candidate} (source: {source})")

            print(f"{'='*80}\n")

        return extracted


    @staticmethod
    def extract_attributes(
        product_text: str,
        mandatory_attrs: Dict[str, List[str]],
        source_map: Optional[Dict[str, str]] = None,
        model: Optional[str] = None,
        extract_additional: bool = True,
        multiple: Optional[List[str]] = None,
        threshold_abs: float = 0.65,
        margin: float = 0.15,
        use_dynamic_thresholds: bool = True,
        use_adaptive_margin: bool = True,
        use_semantic_clustering: bool = True
    ) -> dict:
        """
        Extract mandatory and (optionally) additional attributes for a product.

        The LLM endpoint is asked for both sections. Additional attributes are
        taken from its reply, stripped of placeholder values and tagged with a
        source. Mandatory attributes are then (re)computed for every attribute
        by ``normalize_against_product_text`` and overwrite whatever the LLM
        returned for them. Progress is written to stdout with ``print``.

        Args:
            product_text: Combined product text.
            mandatory_attrs: Attribute name -> allowed values.
            source_map: Source name -> text, for source attribution.
            model: Model identifier; defaults to ``settings.SUPPORTED_MODELS[0]``.
            extract_additional: Also request attributes outside the mandatory list.
            multiple: Names of mandatory attributes that may take several values.
            threshold_abs: Base absolute score threshold for normalization.
            margin: Base score margin for normalization.
            use_dynamic_thresholds: Forwarded to normalization; also controls
                whether cross-attribute relationships are computed.
            use_adaptive_margin: Forwarded to normalization.
            use_semantic_clustering: Forwarded to normalization.

        Returns:
            The parsed response with values in array format
            ``[{"value": "...", "source": "..."}]``, or the result of
            ``_create_error_response`` when the text is empty or any step fails.
        """

        if model is None:
            model = settings.SUPPORTED_MODELS[0]

        if multiple is None:
            multiple = []

        if source_map is None:
            source_map = {}

        # DEBUG: Print what we received
        print("\n" + "="*80)
        print("EXTRACT ATTRIBUTES - INPUT PARAMETERS")
        print("="*80)
        # `or ''` keeps this debug line from raising on None before the
        # empty-text check below gets a chance to handle it.
        print(f"Product text length: {len(product_text or '')}")
        print(f"Mandatory attrs: {list(mandatory_attrs.keys())}")
        print(f"Multiple mode for: {multiple}")
        print(f"Threshold: {threshold_abs}, Margin: {margin}")
        print(f"Dynamic thresholds: {use_dynamic_thresholds}")
        print(f"Adaptive margin: {use_adaptive_margin}")
        print(f"Semantic clustering: {use_semantic_clustering}")
        print("="*80 + "\n")

        # Check if product text is empty or minimal
        if not product_text or product_text == "No product information available":
            return ProductAttributeService._create_error_response(
                "No product information provided",
                mandatory_attrs,
                extract_additional
            )

        # Create structured prompt for mandatory attributes
        mandatory_attr_list = []
        for attr_name, allowed_values in mandatory_attrs.items():
            mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
        mandatory_attr_text = "\n".join(mandatory_attr_list)

        additional_instruction = ""
        if extract_additional:
            additional_instruction = """
    2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text 
    that are NOT in the mandatory list. Only include attributes where you can find actual values
    in the product text. Do NOT include attributes with "Not Specified" or empty values.
    
    Examples of attributes to look for (only if present): Brand, Material, Size, Color, Dimensions,
    Weight, Features, Style, Theme, Pattern, Finish, Care Instructions, etc."""

        output_format = {
            "mandatory": {attr: "value or list of values" for attr in mandatory_attrs.keys()},
        }

        if extract_additional:
            output_format["additional"] = {
                "example_attribute_1": "actual value found",
                "example_attribute_2": "actual value found"
            }
            output_format["additional"]["_note"] = "Only include attributes with actual values found in text"

        prompt = f"""
    You are an intelligent product attribute extractor that works with ANY product type.

    TASK:
    1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value(s)
    from the provided list. Choose the value(s) that best match the product description.
    {additional_instruction}

    Product Text:
    {product_text}

    Mandatory Attribute Lists (MUST select from these allowed values):
    {mandatory_attr_text}

    CRITICAL INSTRUCTIONS:
    - Return ONLY valid JSON, nothing else
    - No explanations, no markdown, no text before or after the JSON
    - For mandatory attributes, choose the value(s) from the provided list that best match
    - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
    - Prefer exact matches from the allowed values list over generic synonyms
    - If multiple values are plausible, you MAY return more than one
    {f"- For additional attributes: ONLY include attributes where you found actual values in the product text. DO NOT include attributes with 'Not Specified', 'None', 'N/A', or empty values. If you cannot find a value for an attribute, simply don't include that attribute." if extract_additional else ""}
    - Be precise and only extract information that is explicitly stated or clearly implied

    Required Output Format:
    {json.dumps(output_format, indent=2)}
            """

        payload = {
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
                },
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.0,
            "max_tokens": 1500
        }

        headers = {
            "Authorization": f"Bearer {settings.GROQ_API_KEY}",
            "Content-Type": "application/json",
        }

        # Bound before the try so the JSONDecodeError handler can always pass
        # it on, even when decoding the HTTP body itself is what failed.
        result_text = ""

        # Values the model uses to say "nothing found" (compared lower-cased).
        placeholder_values = {"not specified", "none", "n/a", ""}

        try:
            response = requests.post(
                settings.GROQ_API_URL,
                headers=headers,
                json=payload,
                timeout=30
            )
            response.raise_for_status()
            result_text = response.json()["choices"][0]["message"]["content"].strip()

            # Clean the response
            result_text = ProductAttributeService._clean_json_response(result_text)

            # Parse JSON
            parsed = json.loads(result_text)

            # Validate and restructure if needed
            parsed = ProductAttributeService._validate_response_structure(
                parsed, mandatory_attrs, extract_additional
            )

            # Clean up and add source tracking to additional attributes in array format
            if extract_additional and "additional" in parsed:
                cleaned_additional = {}
                for k, v in parsed["additional"].items():
                    if not v:
                        continue
                    if isinstance(v, str) and v.strip().lower() in placeholder_values:
                        continue
                    source = ProductAttributeService.find_value_source(str(v), source_map)
                    cleaned_additional[k] = [{"value": str(v), "source": source}]
                parsed["additional"] = cleaned_additional

            # Calculate attribute relationships if using dynamic thresholds
            relationships = {}
            if use_dynamic_thresholds:
                relationships = ProductAttributeService.calculate_attribute_relationships(
                    mandatory_attrs, product_text
                )

            # Process attributes in order, allowing earlier ones to influence later ones
            extracted_so_far = {}
            for attr in mandatory_attrs.keys():
                allow_multiple = attr in multiple

                # DEBUG: Print per-attribute processing
                print(f"\n>>> Processing attribute: {attr}")
                print(f"    Allow multiple: {allow_multiple}")
                print(f"    In multiple list: {attr in multiple}")
                print(f"    Multiple list: {multiple}")

                result = ProductAttributeService.normalize_against_product_text(
                    product_text=product_text,
                    mandatory_attrs={attr: mandatory_attrs[attr]},
                    source_map=source_map,
                    threshold_abs=threshold_abs,
                    margin=margin,
                    allow_multiple=allow_multiple,
                    extracted_attrs=extracted_so_far,
                    relationships=relationships,
                    use_dynamic_thresholds=use_dynamic_thresholds,
                    use_adaptive_margin=use_adaptive_margin,
                    use_semantic_clustering=use_semantic_clustering
                )

                # The normalized result replaces the LLM's mandatory value.
                parsed["mandatory"][attr] = result[attr]
                extracted_so_far[attr] = result[attr]

            return parsed

        except requests.exceptions.RequestException as e:
            return ProductAttributeService._create_error_response(
                str(e), mandatory_attrs, extract_additional
            )
        except json.JSONDecodeError as e:
            return ProductAttributeService._create_error_response(
                f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
            )
        except Exception as e:
            return ProductAttributeService._create_error_response(
                str(e), mandatory_attrs, extract_additional
            )


    @staticmethod
    def extract_attributes_batch(
        products: List[Dict],
        mandatory_attrs: Dict[str, List[str]],
        model: str = None,
        extract_additional: bool = True,
        process_image: bool = True,
        max_workers: int = 5,
        multiple: Optional[List[str]] = None,
        threshold_abs: float = 0.65,
        margin: float = 0.15,
        use_dynamic_thresholds: bool = True,
        use_adaptive_margin: bool = True,
        use_semantic_clustering: bool = True
    ) -> Dict:
        """Extract attributes for multiple products in parallel with enhanced multi-value selection and source tracking."""
        results = []
        successful = 0
        failed = 0
        
        ocr_service = OCRService()

        if multiple is None:
            multiple = []

        def process_product(product_data):
            """Process a single product."""
            product_id = product_data.get('product_id', f"product_{len(results)}")
            
            try:
                # Process image if URL is provided
                ocr_results = None
                ocr_text = None
                
                if process_image and product_data.get('image_url'):
                    ocr_results = ocr_service.process_image(product_data['image_url'])
                    
                    # Extract attributes from OCR
                    if ocr_results and ocr_results.get('detected_text'):
                        ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
                            ocr_results, model
                        )
                        ocr_results['extracted_attributes'] = ocr_attrs
                        
                        # Format OCR text for combining with product text
                        ocr_text = "\n".join([
                            f"{item['text']} (confidence: {item['confidence']:.2f})"
                            for item in ocr_results['detected_text']
                        ])
                
                # Combine all product information with source tracking
                product_text, source_map = ProductAttributeService.combine_product_text(
                    title=product_data.get('title'),
                    short_desc=product_data.get('short_desc'),
                    long_desc=product_data.get('long_desc'),
                    ocr_text=ocr_text
                )
                
                # Extract attributes from combined text with enhanced features
                result = ProductAttributeService.extract_attributes(
                    product_text=product_text,
                    mandatory_attrs=mandatory_attrs,
                    source_map=source_map,
                    model=model,
                    extract_additional=extract_additional,
                    multiple=multiple,
                    threshold_abs=threshold_abs,
                    margin=margin,
                    use_dynamic_thresholds=use_dynamic_thresholds,
                    use_adaptive_margin=use_adaptive_margin,
                    use_semantic_clustering=use_semantic_clustering
                )
                
                result['product_id'] = product_id
                
                # Add OCR results if available
                if ocr_results:
                    result['ocr_results'] = ocr_results
                
                # Check if extraction was successful
                if 'error' not in result:
                    return result, True
                else:
                    return result, False
                    
            except Exception as e:
                return {
                    'product_id': product_id,
                    'mandatory': {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
                    'additional': {} if extract_additional else None,
                    'error': f"Processing error: {str(e)}"
                }, False

        # Process products in parallel
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_product = {
                executor.submit(process_product, product): product 
                for product in products
            }
            
            for future in as_completed(future_to_product):
                try:
                    result, success = future.result()
                    results.append(result)
                    if success:
                        successful += 1
                    else:
                        failed += 1
                except Exception as e:
                    failed += 1
                    results.append({
                        'product_id': 'unknown',
                        'mandatory': {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
                        'additional': {} if extract_additional else None,
                        'error': f"Unexpected error: {str(e)}"
                    })

        return {
            'results': results,
            'total_products': len(products),
            'successful': successful,
            'failed': failed
        }

    @staticmethod
    def _clean_json_response(text: str) -> str:
        """Clean LLM response to extract valid JSON."""
        start_idx = text.find('{')
        end_idx = text.rfind('}')

        if start_idx != -1 and end_idx != -1:
            text = text[start_idx:end_idx + 1]

        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()
            if text.startswith("json"):
                text = text[4:].strip()

        return text

    @staticmethod
    def _validate_response_structure(
        parsed: dict,
        mandatory_attrs: Dict[str, List[str]],
        extract_additional: bool
    ) -> dict:
        """Validate and fix the response structure."""
        expected_sections = ["mandatory"]
        if extract_additional:
            expected_sections.append("additional")

        if not all(section in parsed for section in expected_sections):
            if isinstance(parsed, dict):
                mandatory_keys = set(mandatory_attrs.keys())
                mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
                additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}

                result = {"mandatory": mandatory}
                if extract_additional:
                    result["additional"] = additional
                return result
            else:
                return ProductAttributeService._create_error_response(
                    "Invalid response structure",
                    mandatory_attrs,
                    extract_additional,
                    str(parsed)
                )

        return parsed

    @staticmethod
    def _create_error_response(
        error: str,
        mandatory_attrs: Dict[str, List[str]],
        extract_additional: bool,
        raw_output: Optional[str] = None
    ) -> dict:
        """Create a standardized error response in array format."""
        response = {
            "mandatory": {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
            "error": error
        }
        if extract_additional:
            response["additional"] = {}
        if raw_output:
            response["raw_output"] = raw_output
        return response