services.py 133 KB

  886. # # ==================== services.py (OPTIMIZED) ====================
  887. # import requests
  888. # import json
  889. # import re
  890. # import hashlib
  891. # import logging
  892. # from typing import Dict, List, Optional, Tuple
  893. # from django.conf import settings
  894. # from concurrent.futures import ThreadPoolExecutor, as_completed
  895. # from sentence_transformers import SentenceTransformer, util
  896. # import numpy as np
  897. # from .ocr_service import OCRService
  898. # logger = logging.getLogger(__name__)
  899. # # Initialize embedding model for normalization (SINGLETON)
  900. # model_embedder = SentenceTransformer("all-MiniLM-L6-v2")

# ==================== CACHING CLASSES ====================
class SimpleCache:
    """
    In-memory cache for attribute extraction results.
    No Redis required - uses a Python dict with automatic size management.
    """
    _cache = {}
    _max_size = 1000  # Maximum number of cached items

    @classmethod
    def get(cls, key: str) -> Optional[Dict]:
        """Get cached value by key"""
        return cls._cache.get(key)

    @classmethod
    def set(cls, key: str, value: Dict):
        """Set cache value with automatic size-based cleanup"""
        # Approximate LRU (FIFO by insertion order): drop the oldest 20% when full
        if len(cls._cache) >= cls._max_size:
            items = list(cls._cache.items())
            # Keep newest 80%
            cls._cache = dict(items[int(cls._max_size * 0.2):])
            logger.info(f"Cache cleaned: kept {len(cls._cache)} items")
        cls._cache[key] = value

    @classmethod
    def clear(cls):
        """Clear entire cache"""
        cls._cache.clear()
        logger.info("Cache cleared")

    @classmethod
    def get_stats(cls) -> Dict:
        """Get cache statistics"""
        return {
            "size": len(cls._cache),
            "max_size": cls._max_size,
            "usage_percent": round(len(cls._cache) / cls._max_size * 100, 2)
        }
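
# Usage sketch (illustrative only, not wired into the pipeline): how SimpleCache is
# intended to wrap an expensive extraction call. `run_extraction` is a hypothetical
# callable standing in for whatever produces the result dict; only the get/set flow
# comes from the class above (ProductAttributeService._generate_cache_key is defined
# further down in this module).
def _cached_extraction_sketch(product_text, mandatory_attrs, run_extraction):
    key = ProductAttributeService._generate_cache_key(product_text, mandatory_attrs)
    cached = SimpleCache.get(key)
    if cached is not None:
        return cached  # cache hit: skip the expensive call
    result = run_extraction(product_text, mandatory_attrs)  # cache miss: compute
    SimpleCache.set(key, result)  # stored; oldest entries are evicted when full
    return result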

class EmbeddingCache:
    """
    Cache for sentence transformer embeddings.
    Significantly reduces embedding computation time.
    """
    _cache = {}
    _max_size = 500
    _hit_count = 0
    _miss_count = 0

    @classmethod
    def get_embedding(cls, text: str, model):
        """Get or compute embedding with caching"""
        if text in cls._cache:
            cls._hit_count += 1
            return cls._cache[text]
        # Cache miss - compute embedding
        cls._miss_count += 1
        # Auto-cleanup if cache is full
        if len(cls._cache) >= cls._max_size:
            items = list(cls._cache.items())
            cls._cache = dict(items[int(cls._max_size * 0.3):])
            logger.info(f"Embedding cache cleaned: kept {len(cls._cache)} items")
        # Compute and cache
        embedding = model.encode(text, convert_to_tensor=True)
        cls._cache[text] = embedding
        return embedding

    @classmethod
    def clear(cls):
        """Clear embedding cache"""
        cls._cache.clear()
        cls._hit_count = 0
        cls._miss_count = 0
        logger.info("Embedding cache cleared")

    @classmethod
    def get_stats(cls) -> Dict:
        """Get cache statistics"""
        total = cls._hit_count + cls._miss_count
        hit_rate = (cls._hit_count / total * 100) if total > 0 else 0
        return {
            "size": len(cls._cache),
            "max_size": cls._max_size,
            "hits": cls._hit_count,
            "misses": cls._miss_count,
            "hit_rate_percent": round(hit_rate, 2)
        }
  981. # # ==================== MAIN SERVICE CLASS ====================
  982. # class ProductAttributeService:
  983. # """Service class for extracting product attributes using Groq LLM."""
  984. # @staticmethod
  985. # def _generate_cache_key(product_text: str, mandatory_attrs: Dict) -> str:
  986. # """
  987. # Generate a unique cache key from product text and attributes.
  988. # Uses MD5 hash for consistent short keys.
  989. # """
  990. # # Sort attributes for consistent hashing
  991. # attrs_str = json.dumps(mandatory_attrs, sort_keys=True)
  992. # content = f"{product_text}:{attrs_str}"
  993. # return f"attr_{hashlib.md5(content.encode()).hexdigest()}"
  994. # @staticmethod
  995. # def normalize_dimension_text(text: str) -> str:
  996. # """
  997. # Normalize dimension text to match format like '16x20', '20x30', etc.
  998. # Handles formats like '16 x 20', '16x1.5x20', '16 x 1.5 x 20 Inches'
  999. # Returns the normalized dimension (e.g., '16x20') or empty string if not found.
  1000. # """
  1001. # if not text:
  1002. # return ""
  1003. # # Convert to lowercase and remove common units
  1004. # text = text.lower()
  1005. # text = re.sub(r'\s*(inches|inch|in|cm|centimeters|mm|millimeters)\s*', '', text, flags=re.IGNORECASE)
  1006. # # Extract all numbers from the text
  1007. # numbers = re.findall(r'\d+\.?\d*', text)
  1008. # if not numbers:
  1009. # return ""
  1010. # # Convert to floats first to handle decimals properly
  1011. # float_numbers = []
  1012. # for num in numbers:
  1013. # try:
  1014. # float_numbers.append(float(num))
  1015. # except:
  1016. # continue
  1017. # if len(float_numbers) < 2:
  1018. # return ""
  1019. # # If we have 3 dimensions, it's likely Width x Depth x Height
  1020. # # For wall art, depth is usually small (< 5), so we keep first and last
  1021. # if len(float_numbers) == 3:
  1022. # # Keep first and last values (width and height), skip middle (depth)
  1023. # float_numbers = [float_numbers[0], float_numbers[2]]
  1024. # elif len(float_numbers) > 3:
  1025. # # If more than 3 dimensions, keep the two largest
  1026. # float_numbers = sorted(float_numbers)[-2:]
  1027. # else:
  1028. # # Just 2 dimensions, use as is
  1029. # float_numbers = float_numbers[:2]
  1030. # # Format numbers: use integer if whole, else one decimal
  1031. # formatted_numbers = []
  1032. # for num in float_numbers:
  1033. # if num.is_integer():
  1034. # formatted_numbers.append(str(int(num)))
  1035. # else:
  1036. # formatted_numbers.append(f"{num:.1f}")
  1037. # # Sort to ensure consistent order (smaller x larger)
  1038. # formatted_numbers.sort(key=lambda x: float(x))
  1039. # # Return formatted dimension
  1040. # return f"{formatted_numbers[0]}x{formatted_numbers[1]}"
  1041. # @staticmethod
  1042. # def normalize_value_for_matching(value: str, attr_name: str = "") -> str:
  1043. # """
  1044. # Normalize a value based on its attribute type for better matching.
  1045. # Currently handles dimensions specially, can be extended for other attributes.
  1046. # """
  1047. # # Check if this is a dimension-related attribute
  1048. # dimension_keywords = ['dimension', 'size', 'measurement']
  1049. # if any(keyword in attr_name.lower() for keyword in dimension_keywords):
  1050. # normalized = ProductAttributeService.normalize_dimension_text(value)
  1051. # if normalized:
  1052. # return normalized
  1053. # # For other attributes, just return cleaned value
  1054. # return value.strip()
  1055. # @staticmethod
  1056. # def combine_product_text(
  1057. # title: Optional[str] = None,
  1058. # short_desc: Optional[str] = None,
  1059. # long_desc: Optional[str] = None,
  1060. # ocr_text: Optional[str] = None
  1061. # ) -> Tuple[str, Dict[str, str]]:
  1062. # """
  1063. # Combine product metadata into a single text block.
  1064. # Returns: (combined_text, source_map) where source_map tracks which text came from where
  1065. # """
  1066. # parts = []
  1067. # source_map = {}
  1068. # if title:
  1069. # title_str = str(title).strip()
  1070. # parts.append(f"Title: {title_str}")
  1071. # source_map['title'] = title_str
  1072. # if short_desc:
  1073. # short_str = str(short_desc).strip()
  1074. # parts.append(f"Description: {short_str}")
  1075. # source_map['short_desc'] = short_str
  1076. # if long_desc:
  1077. # long_str = str(long_desc).strip()
  1078. # parts.append(f"Details: {long_str}")
  1079. # source_map['long_desc'] = long_str
  1080. # if ocr_text:
  1081. # parts.append(f"OCR Text: {ocr_text}")
  1082. # source_map['ocr_text'] = ocr_text
  1083. # combined = "\n".join(parts).strip()
  1084. # if not combined:
  1085. # return "No product information available", {}
  1086. # return combined, source_map
  1087. # @staticmethod
  1088. # def find_value_source(value: str, source_map: Dict[str, str], attr_name: str = "") -> str:
  1089. # """
  1090. # Find which source(s) contain the given value.
  1091. # Returns the source name(s) where the value appears.
  1092. # Now handles normalized matching for dimensions.
  1093. # """
  1094. # value_lower = value.lower()
  1095. # # Split value into tokens for better matching
  1096. # value_tokens = set(value_lower.replace("-", " ").replace("x", " ").split())
  1097. # # Check if this is a dimension-related attribute
  1098. # is_dimension_attr = any(keyword in attr_name.lower() for keyword in ['dimension', 'size', 'measurement'])
  1099. # sources_found = []
  1100. # source_scores = {}
  1101. # for source_name, source_text in source_map.items():
  1102. # source_lower = source_text.lower()
  1103. # # Check for exact phrase match first
  1104. # if value_lower in source_lower:
  1105. # source_scores[source_name] = 1.0
  1106. # continue
  1107. # # For dimensions, check normalized match
  1108. # if is_dimension_attr:
  1109. # # Normalize the value (e.g., "16x20" stays "16x20")
  1110. # normalized_value = ProductAttributeService.normalize_dimension_text(value)
  1111. # if not normalized_value:
  1112. # normalized_value = value.replace("x", " ").strip()
  1113. # # Normalize the source text to extract dimensions
  1114. # normalized_source = ProductAttributeService.normalize_dimension_text(source_text)
  1115. # # Direct match
  1116. # if normalized_value == normalized_source:
  1117. # source_scores[source_name] = 0.95
  1118. # continue
  1119. # # Also check if the dimension numbers appear in the source
  1120. # # Extract dimension parts (e.g., "16x20" -> ["16", "20"])
  1121. # dim_parts = normalized_value.split("x") if "x" in normalized_value else []
  1122. # if len(dim_parts) == 2:
  1123. # # Check if both numbers appear in the source
  1124. # if all(part in source_text for part in dim_parts):
  1125. # source_scores[source_name] = 0.85
  1126. # continue
  1127. # # Check for token matches
  1128. # token_matches = sum(1 for token in value_tokens if token and token in source_lower)
  1129. # if token_matches > 0 and len(value_tokens) > 0:
  1130. # source_scores[source_name] = token_matches / len(value_tokens)
  1131. # # Return source with highest score, or all sources if multiple have same score
  1132. # if source_scores:
  1133. # max_score = max(source_scores.values())
  1134. # sources_found = [s for s, score in source_scores.items() if score == max_score]
  1135. # # Prioritize: title > short_desc > long_desc > ocr_text
  1136. # priority = ['title', 'short_desc', 'long_desc', 'ocr_text']
  1137. # for p in priority:
  1138. # if p in sources_found:
  1139. # return p
  1140. # return sources_found[0] if sources_found else "Not found"
  1141. # return "Not found"
  1142. # @staticmethod
  1143. # def format_visual_attributes(visual_attributes: Dict) -> Dict:
  1144. # """
  1145. # Convert visual attributes to array format with source tracking.
  1146. # Source is always 'image' for visual attributes.
  1147. # """
  1148. # formatted = {}
  1149. # for key, value in visual_attributes.items():
  1150. # if isinstance(value, list):
  1151. # # Already a list (like color_palette)
  1152. # formatted[key] = [{"value": str(item), "source": "image"} for item in value]
  1153. # elif isinstance(value, dict):
  1154. # # Nested dictionary - format recursively
  1155. # nested_formatted = {}
  1156. # for nested_key, nested_value in value.items():
  1157. # if isinstance(nested_value, list):
  1158. # nested_formatted[nested_key] = [{"value": str(item), "source": "image"} for item in nested_value]
  1159. # else:
  1160. # nested_formatted[nested_key] = [{"value": str(nested_value), "source": "image"}]
  1161. # formatted[key] = nested_formatted
  1162. # else:
  1163. # # Single value
  1164. # formatted[key] = [{"value": str(value), "source": "image"}]
  1165. # return formatted
  1166. # @staticmethod
  1167. # def extract_attributes_from_ocr(ocr_results: Dict, model: str = None) -> Dict:
  1168. # """Extract structured attributes from OCR text using LLM."""
  1169. # if model is None:
  1170. # model = settings.SUPPORTED_MODELS[0]
  1171. # detected_text = ocr_results.get('detected_text', [])
  1172. # if not detected_text:
  1173. # return {}
  1174. # # Format OCR text for prompt
  1175. # ocr_text = "\n".join([f"Text: {item['text']}, Confidence: {item['confidence']:.2f}"
  1176. # for item in detected_text])
  1177. # prompt = f"""
  1178. # You are an AI model that extracts structured attributes from OCR text detected on product images.
  1179. # Given the OCR detections below, infer the possible product attributes and return them as a clean JSON object.
  1180. # OCR Text:
  1181. # {ocr_text}
  1182. # Extract relevant attributes like:
  1183. # - brand
  1184. # - model_number
  1185. # - size (waist_size, length, etc.)
  1186. # - collection
  1187. # - any other relevant product information
  1188. # Return a JSON object with only the attributes you can confidently identify.
  1189. # If an attribute is not present, do not include it in the response.
  1190. # """
  1191. # payload = {
  1192. # "model": model,
  1193. # "messages": [
  1194. # {
  1195. # "role": "system",
  1196. # "content": "You are a helpful AI that extracts structured data from OCR output. Return only valid JSON."
  1197. # },
  1198. # {"role": "user", "content": prompt}
  1199. # ],
  1200. # "temperature": 0.2,
  1201. # "max_tokens": 500
  1202. # }
  1203. # headers = {
  1204. # "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  1205. # "Content-Type": "application/json",
  1206. # }
  1207. # try:
  1208. # response = requests.post(
  1209. # settings.GROQ_API_URL,
  1210. # headers=headers,
  1211. # json=payload,
  1212. # timeout=30
  1213. # )
  1214. # response.raise_for_status()
  1215. # result_text = response.json()["choices"][0]["message"]["content"].strip()
  1216. # # Clean and parse JSON
  1217. # result_text = ProductAttributeService._clean_json_response(result_text)
  1218. # parsed = json.loads(result_text)
  1219. # # Convert to array format with source tracking
  1220. # formatted_attributes = {}
  1221. # for key, value in parsed.items():
  1222. # if key == "error":
  1223. # continue
  1224. # # Handle nested dictionaries (like size)
  1225. # if isinstance(value, dict):
  1226. # nested_formatted = {}
  1227. # for nested_key, nested_value in value.items():
  1228. # nested_formatted[nested_key] = [{"value": str(nested_value), "source": "image"}]
  1229. # formatted_attributes[key] = nested_formatted
  1230. # elif isinstance(value, list):
  1231. # # Already a list, convert each item
  1232. # formatted_attributes[key] = [{"value": str(item), "source": "image"} for item in value]
  1233. # else:
  1234. # # Single value
  1235. # formatted_attributes[key] = [{"value": str(value), "source": "image"}]
  1236. # return formatted_attributes
  1237. # except Exception as e:
  1238. # logger.error(f"OCR attribute extraction failed: {str(e)}")
  1239. # return {"error": f"Failed to extract attributes from OCR: {str(e)}"}
  1240. # @staticmethod
  1241. # def calculate_attribute_relationships(
  1242. # mandatory_attrs: Dict[str, List[str]],
  1243. # product_text: str
  1244. # ) -> Dict[str, float]:
  1245. # """
  1246. # Calculate semantic relationships between attribute values across different attributes.
  1247. # Returns a matrix of cross-attribute value similarities.
  1248. # """
  1249. # # USE EMBEDDING CACHE
  1250. # pt_emb = EmbeddingCache.get_embedding(product_text, model_embedder)
  1251. # # Calculate similarities between all attribute values and product text
  1252. # attr_scores = {}
  1253. # for attr, values in mandatory_attrs.items():
  1254. # attr_scores[attr] = {}
  1255. # for val in values:
  1256. # contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}"]
  1257. # # USE EMBEDDING CACHE FOR CONTEXTS
  1258. # ctx_embs = [EmbeddingCache.get_embedding(c, model_embedder) for c in contexts]
  1259. # sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
  1260. # attr_scores[attr][val] = sem_sim
  1261. # # Calculate cross-attribute value relationships
  1262. # relationships = {}
  1263. # attr_list = list(mandatory_attrs.keys())
  1264. # for i, attr1 in enumerate(attr_list):
  1265. # for attr2 in attr_list[i+1:]:
  1266. # # Calculate pairwise similarities between values of different attributes
  1267. # for val1 in mandatory_attrs[attr1]:
  1268. # for val2 in mandatory_attrs[attr2]:
  1269. # # USE EMBEDDING CACHE
  1270. # emb1 = EmbeddingCache.get_embedding(val1, model_embedder)
  1271. # emb2 = EmbeddingCache.get_embedding(val2, model_embedder)
  1272. # sim = float(util.cos_sim(emb1, emb2).item())
  1273. # # Store bidirectional relationships
  1274. # key1 = f"{attr1}:{val1}->{attr2}:{val2}"
  1275. # key2 = f"{attr2}:{val2}->{attr1}:{val1}"
  1276. # relationships[key1] = sim
  1277. # relationships[key2] = sim
  1278. # return relationships
  1279. # @staticmethod
  1280. # def calculate_value_clusters(
  1281. # values: List[str],
  1282. # scores: List[Tuple[str, float]],
  1283. # cluster_threshold: float = 0.4
  1284. # ) -> List[List[str]]:
  1285. # """
  1286. # Group values into semantic clusters based on their similarity to each other.
  1287. # Returns clusters of related values.
  1288. # """
  1289. # if len(values) <= 1:
  1290. # return [[val] for val, _ in scores]
  1291. # # Get embeddings for all values - USE CACHE
  1292. # embeddings = [EmbeddingCache.get_embedding(val, model_embedder) for val in values]
  1293. # # Calculate pairwise similarities
  1294. # similarity_matrix = np.zeros((len(values), len(values)))
  1295. # for i in range(len(values)):
  1296. # for j in range(i+1, len(values)):
  1297. # sim = float(util.cos_sim(embeddings[i], embeddings[j]).item())
  1298. # similarity_matrix[i][j] = sim
  1299. # similarity_matrix[j][i] = sim
  1300. # # Simple clustering: group values with high similarity
  1301. # clusters = []
  1302. # visited = set()
  1303. # for i, (val, score) in enumerate(scores):
  1304. # if i in visited:
  1305. # continue
  1306. # cluster = [val]
  1307. # visited.add(i)
  1308. # # Find similar values
  1309. # for j in range(len(values)):
  1310. # if j not in visited and similarity_matrix[i][j] >= cluster_threshold:
  1311. # cluster.append(values[j])
  1312. # visited.add(j)
  1313. # clusters.append(cluster)
  1314. # return clusters
  1315. # @staticmethod
  1316. # def get_dynamic_threshold(
  1317. # attr: str,
  1318. # val: str,
  1319. # base_score: float,
  1320. # extracted_attrs: Dict[str, List[Dict[str, str]]],
  1321. # relationships: Dict[str, float],
  1322. # mandatory_attrs: Dict[str, List[str]],
  1323. # base_threshold: float = 0.65,
  1324. # boost_factor: float = 0.15
  1325. # ) -> float:
  1326. # """
  1327. # Calculate dynamic threshold based on relationships with already-extracted attributes.
  1328. # """
  1329. # threshold = base_threshold
  1330. # # Check relationships with already extracted attributes
  1331. # max_relationship = 0.0
  1332. # for other_attr, other_values_list in extracted_attrs.items():
  1333. # if other_attr == attr:
  1334. # continue
  1335. # for other_val_dict in other_values_list:
  1336. # other_val = other_val_dict['value']
  1337. # key = f"{attr}:{val}->{other_attr}:{other_val}"
  1338. # if key in relationships:
  1339. # max_relationship = max(max_relationship, relationships[key])
  1340. # # If strong relationship exists, lower threshold
  1341. # if max_relationship > 0.6:
  1342. # threshold = base_threshold - (boost_factor * max_relationship)
  1343. # return max(0.3, threshold)
  1344. # @staticmethod
  1345. # def get_adaptive_margin(
  1346. # scores: List[Tuple[str, float]],
  1347. # base_margin: float = 0.15,
  1348. # max_margin: float = 0.22
  1349. # ) -> float:
  1350. # """
  1351. # Calculate adaptive margin based on score distribution.
  1352. # """
  1353. # if len(scores) < 2:
  1354. # return base_margin
  1355. # score_values = [s for _, s in scores]
  1356. # best_score = score_values[0]
  1357. # # If best score is very low, use adaptive margin but be more conservative
  1358. # if best_score < 0.5:
  1359. # # Calculate score spread in top 3-4 scores only (more selective)
  1360. # top_scores = score_values[:min(4, len(score_values))]
  1361. # score_range = max(top_scores) - min(top_scores)
  1362. # # Very controlled margin increase
  1363. # if score_range < 0.30:
  1364. # # Much more conservative scaling
  1365. # score_factor = (0.5 - best_score) * 0.35
  1366. # adaptive = base_margin + score_factor + (0.30 - score_range) * 0.2
  1367. # return min(adaptive, max_margin)
  1368. # return base_margin
  1369. # @staticmethod
  1370. # def _lexical_evidence(product_text: str, label: str) -> float:
  1371. # """Calculate lexical overlap between product text and label."""
  1372. # pt = product_text.lower()
  1373. # tokens = [t for t in label.lower().replace("-", " ").split() if t]
  1374. # if not tokens:
  1375. # return 0.0
  1376. # hits = sum(1 for t in tokens if t in pt)
  1377. # return hits / len(tokens)
  1378. # @staticmethod
  1379. # def normalize_against_product_text(
  1380. # product_text: str,
  1381. # mandatory_attrs: Dict[str, List[str]],
  1382. # source_map: Dict[str, str],
  1383. # threshold_abs: float = 0.65,
  1384. # margin: float = 0.15,
  1385. # allow_multiple: bool = False,
  1386. # sem_weight: float = 0.8,
  1387. # lex_weight: float = 0.2,
  1388. # extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
  1389. # relationships: Optional[Dict[str, float]] = None,
  1390. # use_dynamic_thresholds: bool = True,
  1391. # use_adaptive_margin: bool = True,
  1392. # use_semantic_clustering: bool = True
  1393. # ) -> dict:
  1394. # """
  1395. # Score each allowed value against the product_text with dynamic thresholds.
  1396. # Returns dict with values in array format: [{"value": "...", "source": "..."}]
  1397. # ⚡ OPTIMIZED: Uses EmbeddingCache for faster computation
  1398. # """
  1399. # if extracted_attrs is None:
  1400. # extracted_attrs = {}
  1401. # if relationships is None:
  1402. # relationships = {}
  1403. # # USE EMBEDDING CACHE - CRITICAL OPTIMIZATION
  1404. # pt_emb = EmbeddingCache.get_embedding(product_text, model_embedder)
  1405. # extracted = {}
  1406. # for attr, allowed_values in mandatory_attrs.items():
  1407. # scores: List[Tuple[str, float]] = []
  1408. # # Check if this is a dimension attribute
  1409. # is_dimension_attr = any(keyword in attr.lower() for keyword in ['dimension', 'size', 'measurement'])
  1410. # # Normalize product text once for dimension matching
  1411. # normalized_product_text = ProductAttributeService.normalize_dimension_text(product_text) if is_dimension_attr else ""
  1412. # for val in allowed_values:
  1413. # # For dimension attributes, try exact normalized matching first
  1414. # if is_dimension_attr:
  1415. # # Normalize the allowed value from the list
  1416. # normalized_val = ProductAttributeService.normalize_dimension_text(val)
  1417. # # If we have both normalized values and they match exactly, give highest score
  1418. # if normalized_val and normalized_product_text and normalized_val == normalized_product_text:
  1419. # scores.append((val, 1.0))
  1420. # continue
  1421. # # Also check if the normalized value appears in the original product text
  1422. # if normalized_val:
  1423. # val_numbers = normalized_val.split('x')
  1424. # text_lower = product_text.lower()
  1425. # if all(num in text_lower for num in val_numbers):
  1426. # idx1 = text_lower.find(val_numbers[0])
  1427. # idx2 = text_lower.find(val_numbers[1])
  1428. # if idx1 != -1 and idx2 != -1:
  1429. # distance = abs(idx2 - idx1)
  1430. # if distance < 20:
  1431. # scores.append((val, 0.95))
  1432. # continue
  1433. # # Standard semantic matching - USE EMBEDDING CACHE
  1434. # contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}", f"{val} room"]
  1435. # ctx_embs = [EmbeddingCache.get_embedding(c, model_embedder) for c in contexts]
  1436. # sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
  1437. # lex_score = ProductAttributeService._lexical_evidence(product_text, val)
  1438. # final_score = sem_weight * sem_sim + lex_weight * lex_score
  1439. # scores.append((val, final_score))
  1440. # scores.sort(key=lambda x: x[1], reverse=True)
  1441. # best_val, best_score = scores[0]
  1442. # # Calculate adaptive margin if enabled
  1443. # effective_margin = margin
  1444. # if allow_multiple and use_adaptive_margin:
  1445. # effective_margin = ProductAttributeService.get_adaptive_margin(scores, margin)
  1446. # # Special handling for dimension attributes with exact matches
  1447. # if is_dimension_attr and best_score >= 0.90:
  1448. # source = ProductAttributeService.find_value_source(best_val, source_map, attr)
  1449. # extracted[attr] = [{"value": best_val, "source": source}]
  1450. # continue
  1451. # if not allow_multiple:
  1452. # source = ProductAttributeService.find_value_source(best_val, source_map, attr)
  1453. # extracted[attr] = [{"value": best_val, "source": source}]
  1454. # else:
  1455. # candidates = [best_val]
  1456. # use_base_threshold = best_score >= threshold_abs
  1457. # # Get semantic clusters if enabled
  1458. # clusters = []
  1459. # if use_semantic_clustering:
  1460. # clusters = ProductAttributeService.calculate_value_clusters(
  1461. # allowed_values, scores, cluster_threshold=0.4
  1462. # )
  1463. # best_cluster = next((c for c in clusters if best_val in c), [best_val])
  1464. # for val, sc in scores[1:]:
  1465. # min_score = 0.4 if is_dimension_attr else 0.3
  1466. # if sc < min_score:
  1467. # continue
  1468. # if use_dynamic_thresholds and extracted_attrs:
  1469. # dynamic_thresh = ProductAttributeService.get_dynamic_threshold(
  1470. # attr, val, sc, extracted_attrs, relationships,
  1471. # mandatory_attrs, threshold_abs
  1472. # )
  1473. # else:
  1474. # dynamic_thresh = threshold_abs
  1475. # within_margin = (best_score - sc) <= effective_margin
  1476. # above_threshold = sc >= dynamic_thresh
  1477. # in_cluster = False
  1478. # if use_semantic_clustering and clusters:
  1479. # in_cluster = any(best_val in c and val in c for c in clusters)
  1480. # if use_base_threshold:
  1481. # if above_threshold and within_margin:
  1482. # candidates.append(val)
  1483. # elif in_cluster and within_margin:
  1484. # candidates.append(val)
  1485. # else:
  1486. # if within_margin:
  1487. # candidates.append(val)
  1488. # elif in_cluster and (best_score - sc) <= effective_margin * 2.0:
  1489. # candidates.append(val)
  1490. # extracted[attr] = []
  1491. # for candidate in candidates:
  1492. # source = ProductAttributeService.find_value_source(candidate, source_map, attr)
  1493. # extracted[attr].append({"value": candidate, "source": source})
  1494. # return extracted
  1495. # @staticmethod
  1496. # def extract_attributes(
  1497. # product_text: str,
  1498. # mandatory_attrs: Dict[str, List[str]],
  1499. # source_map: Dict[str, str] = None,
  1500. # model: str = None,
  1501. # extract_additional: bool = True,
  1502. # multiple: Optional[List[str]] = None,
  1503. # threshold_abs: float = 0.65,
  1504. # margin: float = 0.15,
  1505. # use_dynamic_thresholds: bool = True,
  1506. # use_adaptive_margin: bool = True,
  1507. # use_semantic_clustering: bool = True,
  1508. # use_cache: bool = True # ⚡ NEW: Enable/disable caching
  1509. # ) -> dict:
  1510. # """
  1511. # Use Groq LLM to extract attributes from any product type with enhanced multi-value selection.
  1512. # Now returns values in array format: [{"value": "...", "source": "..."}]
  1513. # ⚡ OPTIMIZED: Added caching layer for faster repeated requests
  1514. # """
  1515. # if model is None:
  1516. # model = settings.SUPPORTED_MODELS[0]
  1517. # if multiple is None:
  1518. # multiple = []
  1519. # if source_map is None:
  1520. # source_map = {}
  1521. # # Check if product text is empty or minimal
  1522. # if not product_text or product_text == "No product information available":
  1523. # return ProductAttributeService._create_error_response(
  1524. # "No product information provided",
  1525. # mandatory_attrs,
  1526. # extract_additional
  1527. # )
  1528. # # ⚡ CHECK CACHE FIRST
  1529. # if use_cache:
  1530. # cache_key = ProductAttributeService._generate_cache_key(product_text, mandatory_attrs)
  1531. # cached_result = SimpleCache.get(cache_key)
  1532. # if cached_result:
  1533. # logger.info(f"✓ Cache hit - returning cached result")
  1534. # return cached_result
  1535. # # Create structured prompt for mandatory attributes
  1536. # mandatory_attr_list = []
  1537. # for attr_name, allowed_values in mandatory_attrs.items():
  1538. # mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
  1539. # mandatory_attr_text = "\n".join(mandatory_attr_list)
  1540. # additional_instruction = ""
  1541. # if extract_additional:
  1542. # additional_instruction = """
  1543. # 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text
  1544. # that are NOT in the mandatory list. Only include attributes where you can find actual values
  1545. # in the product text. Do NOT include attributes with "Not Specified" or empty values.
  1546. # Examples of attributes to look for (only if present): Brand, Material, Size, Color, Dimensions,
  1547. # Weight, Features, Style, Theme, Pattern, Finish, Care Instructions, etc."""
  1548. # output_format = {
  1549. # "mandatory": {attr: "value or list of values" for attr in mandatory_attrs.keys()},
  1550. # }
  1551. # if extract_additional:
  1552. # output_format["additional"] = {
  1553. # "example_attribute_1": "actual value found",
  1554. # "example_attribute_2": "actual value found"
  1555. # }
  1556. # output_format["additional"]["_note"] = "Only include attributes with actual values found in text"
  1557. # prompt = f"""
  1558. # You are an intelligent product attribute extractor that works with ANY product type.
  1559. # TASK:
  1560. # 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value(s)
  1561. # from the provided list. Choose the value(s) that best match the product description.
  1562. # {additional_instruction}
  1563. # Product Text:
  1564. # {product_text}
  1565. # Mandatory Attribute Lists (MUST select from these allowed values):
  1566. # {mandatory_attr_text}
  1567. # CRITICAL INSTRUCTIONS:
  1568. # - Return ONLY valid JSON, nothing else
  1569. # - No explanations, no markdown, no text before or after the JSON
  1570. # - For mandatory attributes, choose the value(s) from the provided list that best match
  1571. # - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
  1572. # - Prefer exact matches from the allowed values list over generic synonyms
  1573. # - If multiple values are plausible, you MAY return more than one
  1574. # {f"- For additional attributes: ONLY include attributes where you found actual values in the product text. DO NOT include attributes with 'Not Specified', 'None', 'N/A', or empty values. If you cannot find a value for an attribute, simply don't include that attribute." if extract_additional else ""}
  1575. # - Be precise and only extract information that is explicitly stated or clearly implied
  1576. # Required Output Format:
  1577. # {json.dumps(output_format, indent=2)}
  1578. # """
  1579. # payload = {
  1580. # "model": model,
  1581. # "messages": [
  1582. # {
  1583. # "role": "system",
  1584. # "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
  1585. # },
  1586. # {"role": "user", "content": prompt}
  1587. # ],
  1588. # "temperature": 0.0,
  1589. # "max_tokens": 1500
  1590. # }
  1591. # headers = {
  1592. # "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  1593. # "Content-Type": "application/json",
  1594. # }
  1595. # try:
  1596. # response = requests.post(
  1597. # settings.GROQ_API_URL,
  1598. # headers=headers,
  1599. # json=payload,
  1600. # timeout=30
  1601. # )
  1602. # response.raise_for_status()
  1603. # result_text = response.json()["choices"][0]["message"]["content"].strip()
  1604. # # Clean the response
  1605. # result_text = ProductAttributeService._clean_json_response(result_text)
  1606. # # Parse JSON
  1607. # parsed = json.loads(result_text)
  1608. # # Validate and restructure with source tracking
  1609. # parsed = ProductAttributeService._validate_response_structure(
  1610. # parsed, mandatory_attrs, extract_additional, source_map
  1611. # )
  1612. # # Clean up and add source tracking to additional attributes in array format
  1613. # if extract_additional and "additional" in parsed:
  1614. # cleaned_additional = {}
  1615. # for k, v in parsed["additional"].items():
  1616. # if v and v not in ["Not Specified", "None", "N/A", "", "not specified", "none", "n/a"]:
  1617. # if not (isinstance(v, str) and v.lower() in ["not specified", "none", "n/a", ""]):
  1618. # # Convert to array format if not already
  1619. # if isinstance(v, list):
  1620. # cleaned_additional[k] = []
  1621. # for item in v:
  1622. # if isinstance(item, dict) and "value" in item:
  1623. # if "source" not in item:
  1624. # item["source"] = ProductAttributeService.find_value_source(
  1625. # item["value"], source_map, k
  1626. # )
  1627. # cleaned_additional[k].append(item)
  1628. # else:
  1629. # source = ProductAttributeService.find_value_source(str(item), source_map, k)
  1630. # cleaned_additional[k].append({"value": str(item), "source": source})
  1631. # else:
  1632. # source = ProductAttributeService.find_value_source(str(v), source_map, k)
  1633. # cleaned_additional[k] = [{"value": str(v), "source": source}]
  1634. # parsed["additional"] = cleaned_additional
  1635. # # Calculate attribute relationships if using dynamic thresholds
  1636. # relationships = {}
  1637. # if use_dynamic_thresholds:
  1638. # relationships = ProductAttributeService.calculate_attribute_relationships(
  1639. # mandatory_attrs, product_text
  1640. # )
  1641. # # Process attributes in order, allowing earlier ones to influence later ones
  1642. # extracted_so_far = {}
  1643. # for attr in mandatory_attrs.keys():
  1644. # allow_multiple = attr in multiple
  1645. # result = ProductAttributeService.normalize_against_product_text(
  1646. # product_text=product_text,
  1647. # mandatory_attrs={attr: mandatory_attrs[attr]},
  1648. # source_map=source_map,
  1649. # threshold_abs=threshold_abs,
  1650. # margin=margin,
  1651. # allow_multiple=allow_multiple,
  1652. # extracted_attrs=extracted_so_far,
  1653. # relationships=relationships,
  1654. # use_dynamic_thresholds=use_dynamic_thresholds,
  1655. # use_adaptive_margin=use_adaptive_margin,
  1656. # use_semantic_clustering=use_semantic_clustering
  1657. # )
  1658. # # Result is already in array format from normalize_against_product_text
  1659. # parsed["mandatory"][attr] = result[attr]
  1660. # extracted_so_far[attr] = result[attr]
  1661. # # ⚡ CACHE THE RESULT
  1662. # if use_cache:
  1663. # SimpleCache.set(cache_key, parsed)
  1664. # logger.info(f"✓ Cached extraction result")
  1665. # return parsed
  1666. # except requests.exceptions.RequestException as e:
  1667. # logger.error(f"Request exception: {str(e)}")
  1668. # return ProductAttributeService._create_error_response(
  1669. # str(e), mandatory_attrs, extract_additional
  1670. # )
  1671. # except json.JSONDecodeError as e:
  1672. # logger.error(f"JSON decode error: {str(e)}")
  1673. # return ProductAttributeService._create_error_response(
  1674. # f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
  1675. # )
  1676. # except Exception as e:
  1677. # logger.error(f"Unexpected error: {str(e)}")
  1678. # return ProductAttributeService._create_error_response(
  1679. # str(e), mandatory_attrs, extract_additional
  1680. # )
  1681. # @staticmethod
  1682. # def extract_attributes_batch(
  1683. # products: List[Dict],
  1684. # mandatory_attrs: Dict[str, List[str]],
  1685. # model: str = None,
  1686. # extract_additional: bool = True,
  1687. # process_image: bool = True,
  1688. # max_workers: int = 5,
  1689. # multiple: Optional[List[str]] = None,
  1690. # threshold_abs: float = 0.65,
  1691. # margin: float = 0.15,
  1692. # use_dynamic_thresholds: bool = True,
  1693. # use_adaptive_margin: bool = True,
  1694. # use_semantic_clustering: bool = True,
  1695. # use_cache: bool = True # ⚡ NEW: Enable caching for batch processing
  1696. # ) -> Dict:
  1697. # """
  1698. # Extract attributes for multiple products in parallel with enhanced multi-value selection and source tracking.
  1699. # ⚡ OPTIMIZED: Added caching support for batch operations
  1700. # """
  1701. # results = []
  1702. # successful = 0
  1703. # failed = 0
  1704. # ocr_service = OCRService()
  1705. # if multiple is None:
  1706. # multiple = []
  1707. # def process_product(product_data):
  1708. # """Process a single product."""
  1709. # product_id = product_data.get('product_id', f"product_{len(results)}")
  1710. # try:
  1711. # # Process image if URL is provided
  1712. # ocr_results = None
  1713. # ocr_text = None
  1714. # if process_image and product_data.get('image_url'):
  1715. # ocr_results = ocr_service.process_image(product_data['image_url'])
  1716. # # Extract attributes from OCR
  1717. # if ocr_results and ocr_results.get('detected_text'):
  1718. # ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
  1719. # ocr_results, model
  1720. # )
  1721. # ocr_results['extracted_attributes'] = ocr_attrs
  1722. # # Format OCR text for combining with product text
  1723. # ocr_text = "\n".join([
  1724. # f"{item['text']} (confidence: {item['confidence']:.2f})"
  1725. # for item in ocr_results['detected_text']
  1726. # ])
  1727. # # Combine all product information with source tracking
  1728. # product_text, source_map = ProductAttributeService.combine_product_text(
  1729. # title=product_data.get('title'),
  1730. # short_desc=product_data.get('short_desc'),
  1731. # long_desc=product_data.get('long_desc'),
  1732. # ocr_text=ocr_text
  1733. # )
  1734. # # Extract attributes from combined text with enhanced features
  1735. # result = ProductAttributeService.extract_attributes(
  1736. # product_text=product_text,
  1737. # mandatory_attrs=mandatory_attrs,
  1738. # source_map=source_map,
  1739. # model=model,
  1740. # extract_additional=extract_additional,
  1741. # multiple=multiple,
  1742. # threshold_abs=threshold_abs,
  1743. # margin=margin,
  1744. # use_dynamic_thresholds=use_dynamic_thresholds,
  1745. # use_adaptive_margin=use_adaptive_margin,
  1746. # use_semantic_clustering=use_semantic_clustering,
  1747. # use_cache=use_cache # ⚡ Pass cache flag
  1748. # )
  1749. # result['product_id'] = product_id
  1750. # # Add OCR results if available (already in correct format)
  1751. # if ocr_results:
  1752. # result['ocr_results'] = ocr_results
  1753. # # Check if extraction was successful
  1754. # if 'error' not in result:
  1755. # return result, True
  1756. # else:
  1757. # return result, False
  1758. # except Exception as e:
  1759. # logger.error(f"Error processing product {product_id}: {str(e)}")
  1760. # return {
  1761. # 'product_id': product_id,
  1762. # 'mandatory': {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
  1763. # 'additional': {} if extract_additional else None,
  1764. # 'error': f"Processing error: {str(e)}"
  1765. # }, False
  1766. # # Process products in parallel
  1767. # with ThreadPoolExecutor(max_workers=max_workers) as executor:
  1768. # future_to_product = {
  1769. # executor.submit(process_product, product): product
  1770. # for product in products
  1771. # }
  1772. # for future in as_completed(future_to_product):
  1773. # try:
  1774. # result, success = future.result()
  1775. # results.append(result)
  1776. # if success:
  1777. # successful += 1
  1778. # else:
  1779. # failed += 1
  1780. # except Exception as e:
  1781. # logger.error(f"Future execution error: {str(e)}")
  1782. # failed += 1
  1783. # results.append({
  1784. # 'product_id': 'unknown',
  1785. # 'mandatory': {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
  1786. # 'additional': {} if extract_additional else None,
  1787. # 'error': f"Unexpected error: {str(e)}"
  1788. # })
  1789. # return {
  1790. # 'results': results,
  1791. # 'total_products': len(products),
  1792. # 'successful': successful,
  1793. # 'failed': failed,
  1794. # 'cache_stats': SimpleCache.get_stats(), # ⚡ Include cache statistics
  1795. # 'embedding_cache_stats': EmbeddingCache.get_stats() # ⚡ Include embedding cache stats
  1796. # }
  1797. # @staticmethod
  1798. # def _clean_json_response(text: str) -> str:
  1799. # """Clean LLM response to extract valid JSON."""
  1800. # start_idx = text.find('{')
  1801. # end_idx = text.rfind('}')
  1802. # if start_idx != -1 and end_idx != -1:
  1803. # text = text[start_idx:end_idx + 1]
  1804. # if "```json" in text:
  1805. # text = text.split("```json")[1].split("```")[0].strip()
  1806. # elif "```" in text:
  1807. # text = text.split("```")[1].split("```")[0].strip()
  1808. # if text.startswith("json"):
  1809. # text = text[4:].strip()
  1810. # return text
  1811. # @staticmethod
  1812. # def _validate_response_structure(
  1813. # parsed: dict,
  1814. # mandatory_attrs: Dict[str, List[str]],
  1815. # extract_additional: bool,
  1816. # source_map: Dict[str, str] = None
  1817. # ) -> dict:
  1818. # """Validate and fix the response structure, ensuring array format with source tracking."""
  1819. # if source_map is None:
  1820. # source_map = {}
  1821. # expected_sections = ["mandatory"]
  1822. # if extract_additional:
  1823. # expected_sections.append("additional")
  1824. # if not all(section in parsed for section in expected_sections):
  1825. # if isinstance(parsed, dict):
  1826. # mandatory_keys = set(mandatory_attrs.keys())
  1827. # mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
  1828. # additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
  1829. # result = {"mandatory": mandatory}
  1830. # if extract_additional:
  1831. # result["additional"] = additional
  1832. # parsed = result
  1833. # else:
  1834. # return ProductAttributeService._create_error_response(
  1835. # "Invalid response structure",
  1836. # mandatory_attrs,
  1837. # extract_additional,
  1838. # str(parsed)
  1839. # )
  1840. # # Convert mandatory attributes to array format with source tracking
  1841. # if "mandatory" in parsed:
  1842. # converted_mandatory = {}
  1843. # for attr, value in parsed["mandatory"].items():
  1844. # if isinstance(value, list):
  1845. # # Already in array format, ensure each item has source
  1846. # converted_mandatory[attr] = []
  1847. # for item in value:
  1848. # if isinstance(item, dict) and "value" in item:
  1849. # # Already has proper structure
  1850. # if "source" not in item:
  1851. # item["source"] = ProductAttributeService.find_value_source(
  1852. # item["value"], source_map, attr
  1853. # )
  1854. # converted_mandatory[attr].append(item)
  1855. # else:
  1856. # # Convert string to proper format
  1857. # source = ProductAttributeService.find_value_source(str(item), source_map, attr)
  1858. # converted_mandatory[attr].append({"value": str(item), "source": source})
  1859. # else:
  1860. # # Single value - convert to array format
  1861. # source = ProductAttributeService.find_value_source(str(value), source_map, attr)
  1862. # converted_mandatory[attr] = [{"value": str(value), "source": source}]
  1863. # parsed["mandatory"] = converted_mandatory
  1864. # return parsed
  1865. # @staticmethod
  1866. # def _create_error_response(
  1867. # error: str,
  1868. # mandatory_attrs: Dict[str, List[str]],
  1869. # extract_additional: bool,
  1870. # raw_output: Optional[str] = None
  1871. # ) -> dict:
  1872. # """Create a standardized error response in array format."""
  1873. # response = {
  1874. # "mandatory": {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
  1875. # "error": error
  1876. # }
  1877. # if extract_additional:
  1878. # response["additional"] = {}
  1879. # if raw_output:
  1880. # response["raw_output"] = raw_output
  1881. # return response
  1882. # @staticmethod
  1883. # def get_cache_stats() -> Dict:
  1884. # """
  1885. # Get statistics for both caches.
  1886. # ⚡ NEW: Utility method to monitor cache performance
  1887. # """
  1888. # return {
  1889. # "simple_cache": SimpleCache.get_stats(),
  1890. # "embedding_cache": EmbeddingCache.get_stats()
  1891. # }
  1892. # @staticmethod
  1893. # def clear_all_caches():
  1894. # """
  1895. # Clear both caches.
  1896. # ⚡ NEW: Utility method to reset caches when needed
  1897. # """
  1898. # SimpleCache.clear()
  1899. # EmbeddingCache.clear()
  1900. # logger.info("All caches cleared")
# ==================== services.py (PERFORMANCE OPTIMIZED) ====================
import os
import requests
import json
import re
import hashlib
import logging
from typing import Dict, List, Optional, Tuple
from django.conf import settings
from concurrent.futures import ThreadPoolExecutor, as_completed
from sentence_transformers import SentenceTransformer, util
import numpy as np

logger = logging.getLogger(__name__)

# Silence the Hugging Face tokenizers parallelism warning; this must be set before the
# model loads. Per-call "Batches: 100%" progress bars are suppressed separately via
# show_progress_bar=False in encode().
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# ⚡ CRITICAL FIX: Initialize embedding model ONCE at module level
print("Loading sentence transformer model (one-time initialization)...")
model_embedder = SentenceTransformer("all-MiniLM-L6-v2")
print("✓ Model loaded successfully")
# ==================== CACHING CLASSES ====================
class SimpleCache:
    """In-memory cache for attribute extraction results."""
    _cache = {}
    _max_size = 1000

    @classmethod
    def get(cls, key: str) -> Optional[Dict]:
        return cls._cache.get(key)

    @classmethod
    def set(cls, key: str, value: Dict):
        if len(cls._cache) >= cls._max_size:
            items = list(cls._cache.items())
            cls._cache = dict(items[int(cls._max_size * 0.2):])
        cls._cache[key] = value

    @classmethod
    def clear(cls):
        cls._cache.clear()

    @classmethod
    def get_stats(cls) -> Dict:
        return {
            "size": len(cls._cache),
            "max_size": cls._max_size,
            "usage_percent": round(len(cls._cache) / cls._max_size * 100, 2)
        }
class EmbeddingCache:
    """Cache for sentence transformer embeddings."""
    _cache = {}
    _max_size = 500
    _hit_count = 0
    _miss_count = 0

    @classmethod
    def get_embedding(cls, text: str, model):
        """Get or compute embedding with caching"""
        if text in cls._cache:
            cls._hit_count += 1
            return cls._cache[text]
        cls._miss_count += 1
        if len(cls._cache) >= cls._max_size:
            items = list(cls._cache.items())
            cls._cache = dict(items[int(cls._max_size * 0.3):])
        # ⚡ CRITICAL: Disable verbose output
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            embedding = model.encode(text, convert_to_tensor=True, show_progress_bar=False)
        cls._cache[text] = embedding
        return embedding

    @classmethod
    def clear(cls):
        cls._cache.clear()
        cls._hit_count = 0
        cls._miss_count = 0

    @classmethod
    def get_stats(cls) -> Dict:
        total = cls._hit_count + cls._miss_count
        hit_rate = (cls._hit_count / total * 100) if total > 0 else 0
        return {
            "size": len(cls._cache),
            "hits": cls._hit_count,
            "misses": cls._miss_count,
            "hit_rate_percent": round(hit_rate, 2)
        }
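# Illustrative usage sketch for the two caches (comments only; the key string below is a
# placeholder, not a real digest):
#
#     emb = EmbeddingCache.get_embedding("living room", model_embedder)   # miss: encodes and stores
#     emb = EmbeddingCache.get_embedding("living room", model_embedder)   # hit: returned from the dict
#     EmbeddingCache.get_stats()   # {"size": 1, "hits": 1, "misses": 1, "hit_rate_percent": 50.0}
#
#     SimpleCache.set("attr_<md5 hex>", {"mandatory": {...}})
#     SimpleCache.get("attr_<md5 hex>")   # returns the stored dict, or None on a miss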
# ==================== MAIN SERVICE CLASS ====================
class ProductAttributeService:
    """Service class for extracting product attributes using Groq LLM."""

    @staticmethod
    def _generate_cache_key(product_text: str, mandatory_attrs: Dict) -> str:
        """Generate cache key from product text and attributes."""
        attrs_str = json.dumps(mandatory_attrs, sort_keys=True)
        content = f"{product_text}:{attrs_str}"
        return f"attr_{hashlib.md5(content.encode()).hexdigest()}"
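    # Example key shape (illustrative): the key is "attr_" plus the 32-character MD5 hex
    # digest of "<product_text>:<sorted JSON of mandatory_attrs>", so identical inputs
    # always map to the same SimpleCache entry.
    #
    #     _generate_cache_key("Blue abstract canvas", {"Room": ["Bedroom", "Living Room"]})
    #     # -> "attr_" + 32 hex characters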
    @staticmethod
    def normalize_dimension_text(text: str) -> str:
        """Normalize dimension text to format like '16x20'."""
        if not text:
            return ""
        text = text.lower()
        text = re.sub(r'\s*(inches|inch|in|cm|centimeters|mm|millimeters)\s*', '', text, flags=re.IGNORECASE)
        numbers = re.findall(r'\d+\.?\d*', text)
        if not numbers:
            return ""
        float_numbers = []
        for num in numbers:
            try:
                float_numbers.append(float(num))
            except ValueError:
                continue
        if len(float_numbers) < 2:
            return ""
        if len(float_numbers) == 3:
            # Three values are treated as Width x Depth x Height; drop the middle (depth).
            float_numbers = [float_numbers[0], float_numbers[2]]
        elif len(float_numbers) > 3:
            float_numbers = sorted(float_numbers)[-2:]
        else:
            float_numbers = float_numbers[:2]
        formatted_numbers = []
        for num in float_numbers:
            if num.is_integer():
                formatted_numbers.append(str(int(num)))
            else:
                formatted_numbers.append(f"{num:.1f}")
        formatted_numbers.sort(key=lambda x: float(x))
        return f"{formatted_numbers[0]}x{formatted_numbers[1]}"
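    # Worked examples, traced from the logic above:
    #
    #     normalize_dimension_text("16 x 1.5 x 20 Inches")  # -> "16x20" (middle value treated as depth)
    #     normalize_dimension_text("20 in x 16 in")          # -> "16x20" (sorted smaller-first)
    #     normalize_dimension_text("Size: 30cm")             # -> ""      (needs at least two numbers)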
    @staticmethod
    def normalize_value_for_matching(value: str, attr_name: str = "") -> str:
        """Normalize a value based on its attribute type."""
        dimension_keywords = ['dimension', 'size', 'measurement']
        if any(keyword in attr_name.lower() for keyword in dimension_keywords):
            normalized = ProductAttributeService.normalize_dimension_text(value)
            if normalized:
                return normalized
        return value.strip()
    @staticmethod
    def combine_product_text(
        title: Optional[str] = None,
        short_desc: Optional[str] = None,
        long_desc: Optional[str] = None,
        ocr_text: Optional[str] = None
    ) -> Tuple[str, Dict[str, str]]:
        """Combine product metadata into a single text block."""
        parts = []
        source_map = {}
        if title:
            title_str = str(title).strip()
            parts.append(f"Title: {title_str}")
            source_map['title'] = title_str
        if short_desc:
            short_str = str(short_desc).strip()
            parts.append(f"Description: {short_str}")
            source_map['short_desc'] = short_str
        if long_desc:
            long_str = str(long_desc).strip()
            parts.append(f"Details: {long_str}")
            source_map['long_desc'] = long_str
        if ocr_text:
            parts.append(f"OCR Text: {ocr_text}")
            source_map['ocr_text'] = ocr_text
        combined = "\n".join(parts).strip()
        if not combined:
            return "No product information available", {}
        return combined, source_map
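    # Illustrative call (hypothetical inputs):
    #
    #     text, sources = ProductAttributeService.combine_product_text(
    #         title="Abstract Canvas Print", short_desc="16x20 wall art for the living room")
    #     # text    == "Title: Abstract Canvas Print\nDescription: 16x20 wall art for the living room"
    #     # sources == {"title": "Abstract Canvas Print", "short_desc": "16x20 wall art for the living room"}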
    @staticmethod
    def find_value_source(value: str, source_map: Dict[str, str], attr_name: str = "") -> str:
        """Find which source(s) contain the given value."""
        value_lower = value.lower()
        value_tokens = set(value_lower.replace("-", " ").replace("x", " ").split())
        is_dimension_attr = any(keyword in attr_name.lower() for keyword in ['dimension', 'size', 'measurement'])
        sources_found = []
        source_scores = {}
        for source_name, source_text in source_map.items():
            source_lower = source_text.lower()
            if value_lower in source_lower:
                source_scores[source_name] = 1.0
                continue
            if is_dimension_attr:
                normalized_value = ProductAttributeService.normalize_dimension_text(value)
                if not normalized_value:
                    normalized_value = value.replace("x", " ").strip()
                normalized_source = ProductAttributeService.normalize_dimension_text(source_text)
                if normalized_value == normalized_source:
                    source_scores[source_name] = 0.95
                    continue
                dim_parts = normalized_value.split("x") if "x" in normalized_value else []
                if len(dim_parts) == 2:
                    if all(part in source_text for part in dim_parts):
                        source_scores[source_name] = 0.85
                        continue
            token_matches = sum(1 for token in value_tokens if token and token in source_lower)
            if token_matches > 0 and len(value_tokens) > 0:
                source_scores[source_name] = token_matches / len(value_tokens)
        if source_scores:
            max_score = max(source_scores.values())
            sources_found = [s for s, score in source_scores.items() if score == max_score]
            priority = ['title', 'short_desc', 'long_desc', 'ocr_text']
            for p in priority:
                if p in sources_found:
                    return p
            return sources_found[0] if sources_found else "Not found"
        return "Not found"
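    # Illustrative behaviour (hypothetical source_map): an exact substring match scores 1.0,
    # and ties are broken by the title > short_desc > long_desc > ocr_text priority order.
    #
    #     ProductAttributeService.find_value_source(
    #         "Living Room",
    #         {"title": "Canvas art for living room", "long_desc": "Perfect living room decor"})
    #     # -> "title"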
    @staticmethod
    def format_visual_attributes(visual_attributes: Dict) -> Dict:
        """Convert visual attributes to array format with source tracking."""
        formatted = {}
        for key, value in visual_attributes.items():
            if isinstance(value, list):
                formatted[key] = [{"value": str(item), "source": "image"} for item in value]
            elif isinstance(value, dict):
                nested_formatted = {}
                for nested_key, nested_value in value.items():
                    if isinstance(nested_value, list):
                        nested_formatted[nested_key] = [{"value": str(item), "source": "image"} for item in nested_value]
                    else:
                        nested_formatted[nested_key] = [{"value": str(nested_value), "source": "image"}]
                formatted[key] = nested_formatted
            else:
                formatted[key] = [{"value": str(value), "source": "image"}]
        return formatted
    @staticmethod
    def extract_attributes_from_ocr(ocr_results: Dict, model: str = None) -> Dict:
        """Extract structured attributes from OCR text using LLM."""
        if model is None:
            model = settings.SUPPORTED_MODELS[0]
        detected_text = ocr_results.get('detected_text', [])
        if not detected_text:
            return {}
        ocr_text = "\n".join([f"Text: {item['text']}, Confidence: {item['confidence']:.2f}"
                              for item in detected_text])
        prompt = f"""
You are an AI model that extracts structured attributes from OCR text detected on product images.
Given the OCR detections below, infer the possible product attributes and return them as a clean JSON object.
OCR Text:
{ocr_text}
Extract relevant attributes like:
- brand
- model_number
- size (waist_size, length, etc.)
- collection
- any other relevant product information
Return a JSON object with only the attributes you can confidently identify.
If an attribute is not present, do not include it in the response.
"""
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful AI that extracts structured data from OCR output. Return only valid JSON."
                },
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.2,
            "max_tokens": 500
        }
        headers = {
            "Authorization": f"Bearer {settings.GROQ_API_KEY}",
            "Content-Type": "application/json",
        }
        try:
            response = requests.post(
                settings.GROQ_API_URL,
                headers=headers,
                json=payload,
                timeout=30
            )
            response.raise_for_status()
            result_text = response.json()["choices"][0]["message"]["content"].strip()
            result_text = ProductAttributeService._clean_json_response(result_text)
            parsed = json.loads(result_text)
            formatted_attributes = {}
            for key, value in parsed.items():
                if key == "error":
                    continue
                if isinstance(value, dict):
                    nested_formatted = {}
                    for nested_key, nested_value in value.items():
                        nested_formatted[nested_key] = [{"value": str(nested_value), "source": "image"}]
                    formatted_attributes[key] = nested_formatted
                elif isinstance(value, list):
                    formatted_attributes[key] = [{"value": str(item), "source": "image"} for item in value]
                else:
                    formatted_attributes[key] = [{"value": str(value), "source": "image"}]
            return formatted_attributes
        except Exception as e:
            logger.error(f"OCR attribute extraction failed: {str(e)}")
            return {"error": f"Failed to extract attributes from OCR: {str(e)}"}
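    # Expected input/output shape (illustrative; the extracted values depend on the LLM):
    #
    #     ocr_results = {"detected_text": [{"text": "LEVI'S", "confidence": 0.98},
    #                                      {"text": "W32 L34", "confidence": 0.91}]}
    #     ProductAttributeService.extract_attributes_from_ocr(ocr_results)
    #     # e.g. {"brand": [{"value": "LEVI'S", "source": "image"}],
    #     #       "size": {"waist_size": [{"value": "32", "source": "image"}], ...}}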
    @staticmethod
    def calculate_attribute_relationships(
        mandatory_attrs: Dict[str, List[str]],
        product_text: str
    ) -> Dict[str, float]:
        """Calculate semantic relationships between attribute values."""
        pt_emb = EmbeddingCache.get_embedding(product_text, model_embedder)
        attr_scores = {}
        for attr, values in mandatory_attrs.items():
            attr_scores[attr] = {}
            for val in values:
                contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}"]
                ctx_embs = [EmbeddingCache.get_embedding(c, model_embedder) for c in contexts]
                sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
                attr_scores[attr][val] = sem_sim
        relationships = {}
        attr_list = list(mandatory_attrs.keys())
        for i, attr1 in enumerate(attr_list):
            for attr2 in attr_list[i+1:]:
                for val1 in mandatory_attrs[attr1]:
                    for val2 in mandatory_attrs[attr2]:
                        emb1 = EmbeddingCache.get_embedding(val1, model_embedder)
                        emb2 = EmbeddingCache.get_embedding(val2, model_embedder)
                        sim = float(util.cos_sim(emb1, emb2).item())
                        key1 = f"{attr1}:{val1}->{attr2}:{val2}"
                        key2 = f"{attr2}:{val2}->{attr1}:{val1}"
                        relationships[key1] = sim
                        relationships[key2] = sim
        return relationships
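    # Key format sketch (attribute and value names are hypothetical): the returned dict is
    # keyed symmetrically in both directions, e.g.
    #
    #     relationships["Room:Bedroom->Style:Modern"]   # == relationships["Style:Modern->Room:Bedroom"]
    #
    # Note that attr_scores is computed above for per-value similarity against the product
    # text, but only the cross-attribute relationships dict is returned.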

    @staticmethod
    def calculate_value_clusters(
        values: List[str],
        scores: List[Tuple[str, float]],
        cluster_threshold: float = 0.4
    ) -> List[List[str]]:
        """Group values into semantic clusters."""
        if len(values) <= 1:
            return [[val] for val, _ in scores]
        embeddings = [EmbeddingCache.get_embedding(val, model_embedder) for val in values]
        similarity_matrix = np.zeros((len(values), len(values)))
        for i in range(len(values)):
            for j in range(i+1, len(values)):
                sim = float(util.cos_sim(embeddings[i], embeddings[j]).item())
                similarity_matrix[i][j] = sim
                similarity_matrix[j][i] = sim
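        # Greedy single-pass clustering: walk the candidates in score order and pull in
        # every not-yet-visited value whose similarity to the seed clears the threshold.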
        clusters = []
        visited = set()
        for val, _score in scores:
            # scores may be sorted, so map the value back to its position in `values`;
            # the similarity matrix is indexed by that order.
            i = values.index(val)
            if i in visited:
                continue
            cluster = [val]
            visited.add(i)
            for j in range(len(values)):
                if j not in visited and similarity_matrix[i][j] >= cluster_threshold:
                    cluster.append(values[j])
                    visited.add(j)
            clusters.append(cluster)
        return clusters

    @staticmethod
    def get_dynamic_threshold(
        attr: str,
        val: str,
        base_score: float,
        extracted_attrs: Dict[str, List[Dict[str, str]]],
        relationships: Dict[str, float],
        mandatory_attrs: Dict[str, List[str]],
        base_threshold: float = 0.65,
        boost_factor: float = 0.15
    ) -> float:
        """Calculate dynamic threshold based on relationships."""
        threshold = base_threshold
        max_relationship = 0.0
        for other_attr, other_values_list in extracted_attrs.items():
            if other_attr == attr:
                continue
            for other_val_dict in other_values_list:
                other_val = other_val_dict['value']
                key = f"{attr}:{val}->{other_attr}:{other_val}"
                if key in relationships:
                    max_relationship = max(max_relationship, relationships[key])
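        # A strong relationship to an already-extracted value lowers the bar for this
        # candidate, but never below the 0.3 floor.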
        if max_relationship > 0.6:
            threshold = base_threshold - (boost_factor * max_relationship)
        return max(0.3, threshold)

    @staticmethod
    def get_adaptive_margin(
        scores: List[Tuple[str, float]],
        base_margin: float = 0.15,
        max_margin: float = 0.22
    ) -> float:
        """Calculate adaptive margin based on score distribution."""
        if len(scores) < 2:
            return base_margin
        score_values = [s for _, s in scores]
        best_score = score_values[0]
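        # When the best score is weak and the top scores are tightly bunched, widen the
        # margin (capped at max_margin) so near-ties are not discarded.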
        if best_score < 0.5:
            top_scores = score_values[:min(4, len(score_values))]
            score_range = max(top_scores) - min(top_scores)
            if score_range < 0.30:
                score_factor = (0.5 - best_score) * 0.35
                adaptive = base_margin + score_factor + (0.30 - score_range) * 0.2
                return min(adaptive, max_margin)
        return base_margin

    @staticmethod
    def _lexical_evidence(product_text: str, label: str) -> float:
        """Calculate lexical overlap between product text and label."""
        pt = product_text.lower()
        tokens = [t for t in label.lower().replace("-", " ").split() if t]
        if not tokens:
            return 0.0
        hits = sum(1 for t in tokens if t in pt)
        return hits / len(tokens)

    @staticmethod
    def normalize_against_product_text(
        product_text: str,
        mandatory_attrs: Dict[str, List[str]],
        source_map: Dict[str, str],
        threshold_abs: float = 0.65,
        margin: float = 0.15,
        allow_multiple: bool = False,
        sem_weight: float = 0.8,
        lex_weight: float = 0.2,
        extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
        relationships: Optional[Dict[str, float]] = None,
        use_dynamic_thresholds: bool = True,
        use_adaptive_margin: bool = True,
        use_semantic_clustering: bool = True
    ) -> dict:
        """Score each allowed value against the product_text."""
        if extracted_attrs is None:
            extracted_attrs = {}
        if relationships is None:
            relationships = {}
        pt_emb = EmbeddingCache.get_embedding(product_text, model_embedder)
        extracted = {}
        for attr, allowed_values in mandatory_attrs.items():
            scores: List[Tuple[str, float]] = []
            is_dimension_attr = any(keyword in attr.lower() for keyword in ['dimension', 'size', 'measurement'])
            normalized_product_text = ProductAttributeService.normalize_dimension_text(product_text) if is_dimension_attr else ""
            for val in allowed_values:
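                # Dimension-style attributes get an exact/near-exact fast path before any
                # embedding work: a normalized match scores 1.0, and two matching numbers
                # found close together in the text score 0.95.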
                if is_dimension_attr:
                    normalized_val = ProductAttributeService.normalize_dimension_text(val)
                    if normalized_val and normalized_product_text and normalized_val == normalized_product_text:
                        scores.append((val, 1.0))
                        continue
                    if normalized_val:
                        val_numbers = normalized_val.split('x')
                        text_lower = product_text.lower()
                        # Guard the two-part check so a single-number dimension cannot raise an IndexError.
                        if len(val_numbers) >= 2 and all(num in text_lower for num in val_numbers):
                            idx1 = text_lower.find(val_numbers[0])
                            idx2 = text_lower.find(val_numbers[1])
                            if idx1 != -1 and idx2 != -1:
                                distance = abs(idx2 - idx1)
                                if distance < 20:
                                    scores.append((val, 0.95))
                                    continue
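                # Otherwise blend semantic similarity (best of several phrasing contexts)
                # with simple lexical overlap, weighted by sem_weight/lex_weight.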
                contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}", f"{val} room"]
                ctx_embs = [EmbeddingCache.get_embedding(c, model_embedder) for c in contexts]
                sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
                lex_score = ProductAttributeService._lexical_evidence(product_text, val)
                final_score = sem_weight * sem_sim + lex_weight * lex_score
                scores.append((val, final_score))
            scores.sort(key=lambda x: x[1], reverse=True)
            best_val, best_score = scores[0]
            effective_margin = margin
            if allow_multiple and use_adaptive_margin:
                effective_margin = ProductAttributeService.get_adaptive_margin(scores, margin)
            if is_dimension_attr and best_score >= 0.90:
                source = ProductAttributeService.find_value_source(best_val, source_map, attr)
                extracted[attr] = [{"value": best_val, "source": source}]
                continue
            if not allow_multiple:
                source = ProductAttributeService.find_value_source(best_val, source_map, attr)
                extracted[attr] = [{"value": best_val, "source": source}]
            else:
                candidates = [best_val]
                use_base_threshold = best_score >= threshold_abs
                clusters = []
                if use_semantic_clustering:
                    clusters = ProductAttributeService.calculate_value_clusters(
                        allowed_values, scores, cluster_threshold=0.4
                    )
                    best_cluster = next((c for c in clusters if best_val in c), [best_val])
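                # Consider the remaining candidates in descending score order; a value is kept
                # if it clears its (possibly dynamic) threshold within the margin, or if it
                # shares a semantic cluster with the best value.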
                for val, sc in scores[1:]:
                    min_score = 0.4 if is_dimension_attr else 0.3
                    if sc < min_score:
                        continue
                    if use_dynamic_thresholds and extracted_attrs:
                        dynamic_thresh = ProductAttributeService.get_dynamic_threshold(
                            attr, val, sc, extracted_attrs, relationships,
                            mandatory_attrs, threshold_abs
                        )
                    else:
                        dynamic_thresh = threshold_abs
                    within_margin = (best_score - sc) <= effective_margin
                    above_threshold = sc >= dynamic_thresh
                    in_cluster = False
                    if use_semantic_clustering and clusters:
                        in_cluster = any(best_val in c and val in c for c in clusters)
                    if use_base_threshold:
                        if above_threshold and within_margin:
                            candidates.append(val)
                        elif in_cluster and within_margin:
                            candidates.append(val)
                    else:
                        if within_margin:
                            candidates.append(val)
                        elif in_cluster and (best_score - sc) <= effective_margin * 2.0:
                            candidates.append(val)
                extracted[attr] = []
                for candidate in candidates:
                    source = ProductAttributeService.find_value_source(candidate, source_map, attr)
                    extracted[attr].append({"value": candidate, "source": source})
        return extracted

    @staticmethod
    def extract_attributes(
        product_text: str,
        mandatory_attrs: Dict[str, List[str]],
        source_map: Optional[Dict[str, str]] = None,
        model: Optional[str] = None,
        extract_additional: bool = True,
        multiple: Optional[List[str]] = None,
        threshold_abs: float = 0.65,
        margin: float = 0.15,
        use_dynamic_thresholds: bool = True,
        use_adaptive_margin: bool = True,
        use_semantic_clustering: bool = True,
        use_cache: bool = True
    ) -> dict:
        """Extract attributes from product text using Groq LLM."""
        if model is None:
            model = settings.SUPPORTED_MODELS[0]
        if multiple is None:
            multiple = []
        if source_map is None:
            source_map = {}
        if not product_text or product_text == "No product information available":
            return ProductAttributeService._create_error_response(
                "No product information provided",
                mandatory_attrs,
                extract_additional
            )
        # ⚡ CHECK CACHE FIRST
        if use_cache:
            cache_key = ProductAttributeService._generate_cache_key(product_text, mandatory_attrs)
            cached_result = SimpleCache.get(cache_key)
            if cached_result:
                logger.info("✓ Cache hit")
                return cached_result
        mandatory_attr_list = []
        for attr_name, allowed_values in mandatory_attrs.items():
            mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
        mandatory_attr_text = "\n".join(mandatory_attr_list)
        additional_instruction = ""
        if extract_additional:
            additional_instruction = """
2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text
that are NOT in the mandatory list. Only include attributes where you can find actual values
in the product text. Do NOT include attributes with "Not Specified" or empty values.
Examples of attributes to look for (only if present): Brand, Material, Size, Color, Dimensions,
Weight, Features, Style, Theme, Pattern, Finish, Care Instructions, etc."""
        output_format = {
            "mandatory": {attr: "value or list of values" for attr in mandatory_attrs.keys()},
        }
        if extract_additional:
            output_format["additional"] = {
                "example_attribute_1": "actual value found",
                "example_attribute_2": "actual value found"
            }
            output_format["additional"]["_note"] = "Only include attributes with actual values found in text"
        prompt = f"""
You are an intelligent product attribute extractor that works with ANY product type.
TASK:
1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value(s)
from the provided list. Choose the value(s) that best match the product description.
{additional_instruction}
Product Text:
{product_text}
Mandatory Attribute Lists (MUST select from these allowed values):
{mandatory_attr_text}
CRITICAL INSTRUCTIONS:
- Return ONLY valid JSON, nothing else
- No explanations, no markdown, no text before or after the JSON
- For mandatory attributes, choose the value(s) from the provided list that best match
- If a mandatory attribute cannot be determined from the product text, use "Not Specified"
- Prefer exact matches from the allowed values list over generic synonyms
- If multiple values are plausible, you MAY return more than one
{f"- For additional attributes: ONLY include attributes where you found actual values in the product text. DO NOT include attributes with 'Not Specified', 'None', 'N/A', or empty values. If you cannot find a value for an attribute, simply don't include that attribute." if extract_additional else ""}
- Be precise and only extract information that is explicitly stated or clearly implied
Required Output Format:
{json.dumps(output_format, indent=2)}
"""
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
                },
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.0,
            "max_tokens": 1500
        }
        headers = {
            "Authorization": f"Bearer {settings.GROQ_API_KEY}",
            "Content-Type": "application/json",
        }
        try:
            response = requests.post(
                settings.GROQ_API_URL,
                headers=headers,
                json=payload,
                timeout=30
            )
            response.raise_for_status()
            result_text = response.json()["choices"][0]["message"]["content"].strip()
            result_text = ProductAttributeService._clean_json_response(result_text)
            parsed = json.loads(result_text)
            parsed = ProductAttributeService._validate_response_structure(
                parsed, mandatory_attrs, extract_additional, source_map
            )
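            # Clean the "additional" section: drop placeholder values and coerce every
            # entry into the [{"value": ..., "source": ...}] shape.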
            if extract_additional and "additional" in parsed:
                cleaned_additional = {}
                for k, v in parsed["additional"].items():
                    if v and v not in ["Not Specified", "None", "N/A", "", "not specified", "none", "n/a"]:
                        if not (isinstance(v, str) and v.lower() in ["not specified", "none", "n/a", ""]):
                            if isinstance(v, list):
                                cleaned_additional[k] = []
                                for item in v:
                                    if isinstance(item, dict) and "value" in item:
                                        if "source" not in item:
                                            item["source"] = ProductAttributeService.find_value_source(
                                                item["value"], source_map, k
                                            )
                                        cleaned_additional[k].append(item)
                                    else:
                                        source = ProductAttributeService.find_value_source(str(item), source_map, k)
                                        cleaned_additional[k].append({"value": str(item), "source": source})
                            else:
                                source = ProductAttributeService.find_value_source(str(v), source_map, k)
                                cleaned_additional[k] = [{"value": str(v), "source": source}]
                parsed["additional"] = cleaned_additional
            relationships = {}
            if use_dynamic_thresholds:
                relationships = ProductAttributeService.calculate_attribute_relationships(
                    mandatory_attrs, product_text
                )
            extracted_so_far = {}
            for attr in mandatory_attrs.keys():
                allow_multiple = attr in multiple
                result = ProductAttributeService.normalize_against_product_text(
                    product_text=product_text,
                    mandatory_attrs={attr: mandatory_attrs[attr]},
                    source_map=source_map,
                    threshold_abs=threshold_abs,
                    margin=margin,
                    allow_multiple=allow_multiple,
                    extracted_attrs=extracted_so_far,
                    relationships=relationships,
                    use_dynamic_thresholds=use_dynamic_thresholds,
                    use_adaptive_margin=use_adaptive_margin,
                    use_semantic_clustering=use_semantic_clustering
                )
                parsed["mandatory"][attr] = result[attr]
                extracted_so_far[attr] = result[attr]
            # ⚡ CACHE THE RESULT
            if use_cache:
                SimpleCache.set(cache_key, parsed)
            return parsed
        except requests.exceptions.RequestException as e:
            logger.error(f"Request exception: {str(e)}")
            return ProductAttributeService._create_error_response(
                str(e), mandatory_attrs, extract_additional
            )
        except json.JSONDecodeError as e:
            logger.error(f"JSON decode error: {str(e)}")
            return ProductAttributeService._create_error_response(
                f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
            )
        except Exception as e:
            logger.error(f"Unexpected error: {str(e)}")
            return ProductAttributeService._create_error_response(
                str(e), mandatory_attrs, extract_additional
            )

    @staticmethod
    def _clean_json_response(text: str) -> str:
        """Clean LLM response to extract valid JSON."""
        start_idx = text.find('{')
        end_idx = text.rfind('}')
        if start_idx != -1 and end_idx != -1:
            text = text[start_idx:end_idx + 1]
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()
        if text.startswith("json"):
            text = text[4:].strip()
        return text

    @staticmethod
    def _validate_response_structure(
        parsed: dict,
        mandatory_attrs: Dict[str, List[str]],
        extract_additional: bool,
        source_map: Optional[Dict[str, str]] = None
    ) -> dict:
        """Validate and fix the response structure."""
        if source_map is None:
            source_map = {}
        expected_sections = ["mandatory"]
        if extract_additional:
            expected_sections.append("additional")
        if not all(section in parsed for section in expected_sections):
            if isinstance(parsed, dict):
                mandatory_keys = set(mandatory_attrs.keys())
                mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
                additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
                result = {"mandatory": mandatory}
                if extract_additional:
                    result["additional"] = additional
                parsed = result
            else:
                return ProductAttributeService._create_error_response(
                    "Invalid response structure",
                    mandatory_attrs,
                    extract_additional,
                    str(parsed)
                )
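        # Coerce every mandatory value into a list of {"value", "source"} dicts,
        # attaching a source via find_value_source when the LLM did not supply one.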
  2612. if "mandatory" in parsed:
  2613. converted_mandatory = {}
  2614. for attr, value in parsed["mandatory"].items():
  2615. if isinstance(value, list):
  2616. converted_mandatory[attr] = []
  2617. for item in value:
  2618. if isinstance(item, dict) and "value" in item:
  2619. if "source" not in item:
  2620. item["source"] = ProductAttributeService.find_value_source(
  2621. item["value"], source_map, attr
  2622. )
  2623. converted_mandatory[attr].append(item)
  2624. else:
  2625. source = ProductAttributeService.find_value_source(str(item), source_map, attr)
  2626. converted_mandatory[attr].append({"value": str(item), "source": source})
  2627. else:
  2628. source = ProductAttributeService.find_value_source(str(value), source_map, attr)
  2629. converted_mandatory[attr] = [{"value": str(value), "source": source}]
  2630. parsed["mandatory"] = converted_mandatory
  2631. return parsed

    @staticmethod
    def _create_error_response(
        error: str,
        mandatory_attrs: Dict[str, List[str]],
        extract_additional: bool,
        raw_output: Optional[str] = None
    ) -> dict:
        """Create a standardized error response."""
        response = {
            "mandatory": {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
            "error": error
        }
        if extract_additional:
            response["additional"] = {}
        if raw_output:
            response["raw_output"] = raw_output
        return response

    @staticmethod
    def get_cache_stats() -> Dict:
        """Get statistics for both caches."""
        return {
            "simple_cache": SimpleCache.get_stats(),
            "embedding_cache": EmbeddingCache.get_stats()
        }

    @staticmethod
    def clear_all_caches():
        """Clear both caches."""
        SimpleCache.clear()
        EmbeddingCache.clear()
        logger.info("All caches cleared")