services.py 107 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699
  1. # # import requests
  2. # # import json
  3. # # from typing import Dict, List, Optional
  4. # # from django.conf import settings
  5. # # class ProductAttributeService:
  6. # # """Service class for extracting product attributes using Groq LLM."""
  7. # # @staticmethod
  8. # # def combine_product_text(
  9. # # title: Optional[str] = None,
  10. # # short_desc: Optional[str] = None,
  11. # # long_desc: Optional[str] = None
  12. # # ) -> str:
  13. # # """Combine product metadata into a single text block."""
  14. # # parts = []
  15. # # if title:
  16. # # parts.append(str(title).strip())
  17. # # if short_desc:
  18. # # parts.append(str(short_desc).strip())
  19. # # if long_desc:
  20. # # parts.append(str(long_desc).strip())
  21. # # combined = " ".join(parts).strip()
  22. # # if not combined:
  23. # # return "No product information available"
  24. # # return combined
  25. # # @staticmethod
  26. # # def extract_attributes(
  27. # # product_text: str,
  28. # # mandatory_attrs: Dict[str, List[str]],
  29. # # model: str = None,
  30. # # extract_additional: bool = True
  31. # # ) -> dict:
  32. # # """Use Groq LLM to extract attributes from any product type."""
  33. # # if model is None:
  34. # # model = settings.SUPPORTED_MODELS[0]
  35. # # # Check if product text is empty or minimal
  36. # # if not product_text or product_text == "No product information available":
  37. # # return ProductAttributeService._create_error_response(
  38. # # "No product information provided",
  39. # # mandatory_attrs,
  40. # # extract_additional
  41. # # )
  42. # # # Create structured prompt for mandatory attributes
  43. # # mandatory_attr_list = []
  44. # # for attr_name, allowed_values in mandatory_attrs.items():
  45. # # mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
  46. # # mandatory_attr_text = "\n".join(mandatory_attr_list)
  47. # # additional_instruction = ""
  48. # # if extract_additional:
  49. # # additional_instruction = """
  50. # # 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text
  51. # # (such as Material, Size, Color, Brand, Dimensions, Weight, Features, Specifications, etc.)
  52. # # and their values. Extract attributes that are specific and relevant to this product type."""
  53. # # output_format = {
  54. # # "mandatory": {attr: "value" for attr in mandatory_attrs.keys()},
  55. # # "additional": {} if extract_additional else None
  56. # # }
  57. # # if not extract_additional:
  58. # # output_format.pop("additional")
  59. # # prompt = f"""
  60. # # You are an intelligent product attribute extractor that works with ANY product type.
  61. # # TASK:
  62. # # 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value
  63. # # from the provided list. Choose the value that best matches the product description.
  64. # # {additional_instruction}
  65. # # Product Text:
  66. # # {product_text}
  67. # # Mandatory Attribute Lists (MUST select one value for each):
  68. # # {mandatory_attr_text}
  69. # # CRITICAL INSTRUCTIONS:
  70. # # - Return ONLY valid JSON, nothing else
  71. # # - No explanations, no markdown, no text before or after the JSON
  72. # # - For mandatory attributes, choose EXACTLY ONE value from the provided list that best matches
  73. # # - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
  74. # # - Work with whatever information is available - the product text may be incomplete (only title, or only description, etc.)
  75. # # {f"- For additional attributes, extract any relevant information found in the product text" if extract_additional else ""}
  76. # # - Be precise and only extract information that is explicitly stated or clearly implied
  77. # # Required Output Format (ONLY THIS, NO OTHER TEXT):
  78. # # {json.dumps(output_format, indent=2)}
  79. # # """
  80. # # payload = {
  81. # # "model": model,
  82. # # "messages": [
  83. # # {
  84. # # "role": "system",
  85. # # "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
  86. # # },
  87. # # {"role": "user", "content": prompt}
  88. # # ],
  89. # # "temperature": 0.0,
  90. # # "max_tokens": 1500
  91. # # }
  92. # # headers = {
  93. # # "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  94. # # "Content-Type": "application/json",
  95. # # }
  96. # # try:
  97. # # response = requests.post(
  98. # # settings.GROQ_API_URL,
  99. # # headers=headers,
  100. # # json=payload,
  101. # # timeout=30
  102. # # )
  103. # # response.raise_for_status()
  104. # # result_text = response.json()["choices"][0]["message"]["content"].strip()
  105. # # # Clean the response
  106. # # result_text = ProductAttributeService._clean_json_response(result_text)
  107. # # # Parse JSON
  108. # # parsed = json.loads(result_text)
  109. # # # Validate and restructure if needed
  110. # # parsed = ProductAttributeService._validate_response_structure(
  111. # # parsed, mandatory_attrs, extract_additional
  112. # # )
  113. # # return parsed
  114. # # except requests.exceptions.RequestException as e:
  115. # # return ProductAttributeService._create_error_response(
  116. # # str(e), mandatory_attrs, extract_additional
  117. # # )
  118. # # except json.JSONDecodeError as e:
  119. # # return ProductAttributeService._create_error_response(
  120. # # f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
  121. # # )
  122. # # except Exception as e:
  123. # # return ProductAttributeService._create_error_response(
  124. # # str(e), mandatory_attrs, extract_additional
  125. # # )
  126. # # @staticmethod
  127. # # def _clean_json_response(text: str) -> str:
  128. # # """Clean LLM response to extract valid JSON."""
  129. # # start_idx = text.find('{')
  130. # # end_idx = text.rfind('}')
  131. # # if start_idx != -1 and end_idx != -1:
  132. # # text = text[start_idx:end_idx + 1]
  133. # # if "```json" in text:
  134. # # text = text.split("```json")[1].split("```")[0].strip()
  135. # # elif "```" in text:
  136. # # text = text.split("```")[1].split("```")[0].strip()
  137. # # if text.startswith("json"):
  138. # # text = text[4:].strip()
  139. # # return text
  140. # # @staticmethod
  141. # # def _validate_response_structure(
  142. # # parsed: dict,
  143. # # mandatory_attrs: Dict[str, List[str]],
  144. # # extract_additional: bool
  145. # # ) -> dict:
  146. # # """Validate and fix the response structure."""
  147. # # expected_sections = ["mandatory"]
  148. # # if extract_additional:
  149. # # expected_sections.append("additional")
  150. # # if not all(section in parsed for section in expected_sections):
  151. # # if isinstance(parsed, dict):
  152. # # mandatory_keys = set(mandatory_attrs.keys())
  153. # # mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
  154. # # additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
  155. # # result = {"mandatory": mandatory}
  156. # # if extract_additional:
  157. # # result["additional"] = additional
  158. # # return result
  159. # # else:
  160. # # return ProductAttributeService._create_error_response(
  161. # # "Invalid response structure",
  162. # # mandatory_attrs,
  163. # # extract_additional,
  164. # # str(parsed)
  165. # # )
  166. # # return parsed
  167. # # @staticmethod
  168. # # def _create_error_response(
  169. # # error: str,
  170. # # mandatory_attrs: Dict[str, List[str]],
  171. # # extract_additional: bool,
  172. # # raw_output: Optional[str] = None
  173. # # ) -> dict:
  174. # # """Create a standardized error response."""
  175. # # response = {
  176. # # "mandatory": {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  177. # # "error": error
  178. # # }
  179. # # if extract_additional:
  180. # # response["additional"] = {}
  181. # # if raw_output:
  182. # # response["raw_output"] = raw_output
  183. # # return response
  184. # import requests
  185. # import json
  186. # from typing import Dict, List, Optional
  187. # from django.conf import settings
  188. # from concurrent.futures import ThreadPoolExecutor, as_completed
  189. # class ProductAttributeService:
  190. # """Service class for extracting product attributes using Groq LLM."""
  191. # @staticmethod
  192. # def combine_product_text(
  193. # title: Optional[str] = None,
  194. # short_desc: Optional[str] = None,
  195. # long_desc: Optional[str] = None
  196. # ) -> str:
  197. # """Combine product metadata into a single text block."""
  198. # parts = []
  199. # if title:
  200. # parts.append(str(title).strip())
  201. # if short_desc:
  202. # parts.append(str(short_desc).strip())
  203. # if long_desc:
  204. # parts.append(str(long_desc).strip())
  205. # combined = " ".join(parts).strip()
  206. # if not combined:
  207. # return "No product information available"
  208. # return combined
  209. # @staticmethod
  210. # def extract_attributes(
  211. # product_text: str,
  212. # mandatory_attrs: Dict[str, List[str]],
  213. # model: str = None,
  214. # extract_additional: bool = True
  215. # ) -> dict:
  216. # """Use Groq LLM to extract attributes from any product type."""
  217. # if model is None:
  218. # model = settings.SUPPORTED_MODELS[0]
  219. # # Check if product text is empty or minimal
  220. # if not product_text or product_text == "No product information available":
  221. # return ProductAttributeService._create_error_response(
  222. # "No product information provided",
  223. # mandatory_attrs,
  224. # extract_additional
  225. # )
  226. # # Create structured prompt for mandatory attributes
  227. # mandatory_attr_list = []
  228. # for attr_name, allowed_values in mandatory_attrs.items():
  229. # mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
  230. # mandatory_attr_text = "\n".join(mandatory_attr_list)
  231. # additional_instruction = ""
  232. # if extract_additional:
  233. # additional_instruction = """
  234. # 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text
  235. # (such as Material, Size, Color, Brand, Dimensions, Weight, Features, Specifications, etc.)
  236. # and their values. Extract attributes that are specific and relevant to this product type."""
  237. # output_format = {
  238. # "mandatory": {attr: "value" for attr in mandatory_attrs.keys()},
  239. # "additional": {} if extract_additional else None
  240. # }
  241. # if not extract_additional:
  242. # output_format.pop("additional")
  243. # prompt = f"""
  244. # You are an intelligent product attribute extractor that works with ANY product type.
  245. # TASK:
  246. # 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value
  247. # from the provided list. Choose the value that best matches the product description.
  248. # {additional_instruction}
  249. # Product Text:
  250. # {product_text}
  251. # Mandatory Attribute Lists (MUST select one value for each):
  252. # {mandatory_attr_text}
  253. # CRITICAL INSTRUCTIONS:
  254. # - Return ONLY valid JSON, nothing else
  255. # - No explanations, no markdown, no text before or after the JSON
  256. # - For mandatory attributes, choose EXACTLY ONE value from the provided list that best matches
  257. # - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
  258. # - Work with whatever information is available - the product text may be incomplete (only title, or only description, etc.)
  259. # {f"- For additional attributes, extract any relevant information found in the product text" if extract_additional else ""}
  260. # - Be precise and only extract information that is explicitly stated or clearly implied
  261. # Required Output Format (ONLY THIS, NO OTHER TEXT):
  262. # {json.dumps(output_format, indent=2)}
  263. # """
  264. # payload = {
  265. # "model": model,
  266. # "messages": [
  267. # {
  268. # "role": "system",
  269. # "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
  270. # },
  271. # {"role": "user", "content": prompt}
  272. # ],
  273. # "temperature": 0.0,
  274. # "max_tokens": 1500
  275. # }
  276. # headers = {
  277. # "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  278. # "Content-Type": "application/json",
  279. # }
  280. # try:
  281. # response = requests.post(
  282. # settings.GROQ_API_URL,
  283. # headers=headers,
  284. # json=payload,
  285. # timeout=30
  286. # )
  287. # response.raise_for_status()
  288. # result_text = response.json()["choices"][0]["message"]["content"].strip()
  289. # # Clean the response
  290. # result_text = ProductAttributeService._clean_json_response(result_text)
  291. # # Parse JSON
  292. # parsed = json.loads(result_text)
  293. # # Validate and restructure if needed
  294. # parsed = ProductAttributeService._validate_response_structure(
  295. # parsed, mandatory_attrs, extract_additional
  296. # )
  297. # return parsed
  298. # except requests.exceptions.RequestException as e:
  299. # return ProductAttributeService._create_error_response(
  300. # str(e), mandatory_attrs, extract_additional
  301. # )
  302. # except json.JSONDecodeError as e:
  303. # return ProductAttributeService._create_error_response(
  304. # f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
  305. # )
  306. # except Exception as e:
  307. # return ProductAttributeService._create_error_response(
  308. # str(e), mandatory_attrs, extract_additional
  309. # )
  310. # @staticmethod
  311. # def extract_attributes_batch(
  312. # products: List[Dict],
  313. # mandatory_attrs: Dict[str, List[str]],
  314. # model: str = None,
  315. # extract_additional: bool = True,
  316. # max_workers: int = 5
  317. # ) -> Dict:
  318. # """
  319. # Extract attributes for multiple products in parallel.
  320. # Args:
  321. # products: List of product dictionaries with keys: product_id, title, short_desc, long_desc
  322. # mandatory_attrs: Dictionary of mandatory attributes
  323. # model: Groq model to use
  324. # extract_additional: Whether to extract additional attributes
  325. # max_workers: Maximum number of parallel workers
  326. # Returns:
  327. # Dictionary with results, total_products, successful, and failed counts
  328. # """
  329. # results = []
  330. # successful = 0
  331. # failed = 0
  332. # def process_product(product_data):
  333. # """Process a single product."""
  334. # product_id = product_data.get('product_id', f"product_{len(results)}")
  335. # try:
  336. # product_text = ProductAttributeService.combine_product_text(
  337. # title=product_data.get('title'),
  338. # short_desc=product_data.get('short_desc'),
  339. # long_desc=product_data.get('long_desc')
  340. # )
  341. # result = ProductAttributeService.extract_attributes(
  342. # product_text=product_text,
  343. # mandatory_attrs=mandatory_attrs,
  344. # model=model,
  345. # extract_additional=extract_additional
  346. # )
  347. # result['product_id'] = product_id
  348. # # Check if extraction was successful
  349. # if 'error' not in result:
  350. # return result, True
  351. # else:
  352. # return result, False
  353. # except Exception as e:
  354. # return {
  355. # 'product_id': product_id,
  356. # 'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  357. # 'additional': {} if extract_additional else None,
  358. # 'error': f"Processing error: {str(e)}"
  359. # }, False
  360. # # Process products in parallel
  361. # with ThreadPoolExecutor(max_workers=max_workers) as executor:
  362. # future_to_product = {
  363. # executor.submit(process_product, product): product
  364. # for product in products
  365. # }
  366. # for future in as_completed(future_to_product):
  367. # try:
  368. # result, success = future.result()
  369. # results.append(result)
  370. # if success:
  371. # successful += 1
  372. # else:
  373. # failed += 1
  374. # except Exception as e:
  375. # failed += 1
  376. # results.append({
  377. # 'product_id': 'unknown',
  378. # 'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  379. # 'additional': {} if extract_additional else None,
  380. # 'error': f"Unexpected error: {str(e)}"
  381. # })
  382. # return {
  383. # 'results': results,
  384. # 'total_products': len(products),
  385. # 'successful': successful,
  386. # 'failed': failed
  387. # }
  388. # @staticmethod
  389. # def _clean_json_response(text: str) -> str:
  390. # """Clean LLM response to extract valid JSON."""
  391. # start_idx = text.find('{')
  392. # end_idx = text.rfind('}')
  393. # if start_idx != -1 and end_idx != -1:
  394. # text = text[start_idx:end_idx + 1]
  395. # if "```json" in text:
  396. # text = text.split("```json")[1].split("```")[0].strip()
  397. # elif "```" in text:
  398. # text = text.split("```")[1].split("```")[0].strip()
  399. # if text.startswith("json"):
  400. # text = text[4:].strip()
  401. # return text
  402. # @staticmethod
  403. # def _validate_response_structure(
  404. # parsed: dict,
  405. # mandatory_attrs: Dict[str, List[str]],
  406. # extract_additional: bool
  407. # ) -> dict:
  408. # """Validate and fix the response structure."""
  409. # expected_sections = ["mandatory"]
  410. # if extract_additional:
  411. # expected_sections.append("additional")
  412. # if not all(section in parsed for section in expected_sections):
  413. # if isinstance(parsed, dict):
  414. # mandatory_keys = set(mandatory_attrs.keys())
  415. # mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
  416. # additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
  417. # result = {"mandatory": mandatory}
  418. # if extract_additional:
  419. # result["additional"] = additional
  420. # return result
  421. # else:
  422. # return ProductAttributeService._create_error_response(
  423. # "Invalid response structure",
  424. # mandatory_attrs,
  425. # extract_additional,
  426. # str(parsed)
  427. # )
  428. # return parsed
  429. # @staticmethod
  430. # def _create_error_response(
  431. # error: str,
  432. # mandatory_attrs: Dict[str, List[str]],
  433. # extract_additional: bool,
  434. # raw_output: Optional[str] = None
  435. # ) -> dict:
  436. # """Create a standardized error response."""
  437. # response = {
  438. # "mandatory": {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  439. # "error": error
  440. # }
  441. # if extract_additional:
  442. # response["additional"] = {}
  443. # if raw_output:
  444. # response["raw_output"] = raw_output
  445. # return response
  446. # # ==================== services.py ====================
  447. # import requests
  448. # import json
  449. # from typing import Dict, List, Optional
  450. # from django.conf import settings
  451. # from concurrent.futures import ThreadPoolExecutor, as_completed
  452. # from .ocr_service import OCRService
  453. # class ProductAttributeService:
  454. # """Service class for extracting product attributes using Groq LLM."""
  455. # @staticmethod
  456. # def combine_product_text(
  457. # title: Optional[str] = None,
  458. # short_desc: Optional[str] = None,
  459. # long_desc: Optional[str] = None,
  460. # ocr_text: Optional[str] = None
  461. # ) -> str:
  462. # """Combine product metadata into a single text block."""
  463. # parts = []
  464. # if title:
  465. # parts.append(f"Title: {str(title).strip()}")
  466. # if short_desc:
  467. # parts.append(f"Description: {str(short_desc).strip()}")
  468. # if long_desc:
  469. # parts.append(f"Details: {str(long_desc).strip()}")
  470. # if ocr_text:
  471. # parts.append(f"OCR Text: {ocr_text}")
  472. # combined = "\n".join(parts).strip()
  473. # if not combined:
  474. # return "No product information available"
  475. # return combined
  476. # @staticmethod
  477. # def extract_attributes_from_ocr(ocr_results: Dict, model: str = None) -> Dict:
  478. # """Extract structured attributes from OCR text using LLM."""
  479. # if model is None:
  480. # model = settings.SUPPORTED_MODELS[0]
  481. # detected_text = ocr_results.get('detected_text', [])
  482. # if not detected_text:
  483. # return {}
  484. # # Format OCR text for prompt
  485. # ocr_text = "\n".join([f"Text: {item['text']}, Confidence: {item['confidence']:.2f}"
  486. # for item in detected_text])
  487. # prompt = f"""
  488. # You are an AI model that extracts structured attributes from OCR text detected on product images.
  489. # Given the OCR detections below, infer the possible product attributes and return them as a clean JSON object.
  490. # OCR Text:
  491. # {ocr_text}
  492. # Extract relevant attributes like:
  493. # - brand
  494. # - model_number
  495. # - size (waist_size, length, etc.)
  496. # - collection
  497. # - any other relevant product information
  498. # Return a JSON object with only the attributes you can confidently identify.
  499. # If an attribute is not present, do not include it in the response.
  500. # """
  501. # payload = {
  502. # "model": model,
  503. # "messages": [
  504. # {
  505. # "role": "system",
  506. # "content": "You are a helpful AI that extracts structured data from OCR output. Return only valid JSON."
  507. # },
  508. # {"role": "user", "content": prompt}
  509. # ],
  510. # "temperature": 0.2,
  511. # "max_tokens": 500
  512. # }
  513. # headers = {
  514. # "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  515. # "Content-Type": "application/json",
  516. # }
  517. # try:
  518. # response = requests.post(
  519. # settings.GROQ_API_URL,
  520. # headers=headers,
  521. # json=payload,
  522. # timeout=30
  523. # )
  524. # response.raise_for_status()
  525. # result_text = response.json()["choices"][0]["message"]["content"].strip()
  526. # # Clean and parse JSON
  527. # result_text = ProductAttributeService._clean_json_response(result_text)
  528. # parsed = json.loads(result_text)
  529. # return parsed
  530. # except Exception as e:
  531. # return {"error": f"Failed to extract attributes from OCR: {str(e)}"}
  532. # @staticmethod
  533. # def extract_attributes(
  534. # product_text: str,
  535. # mandatory_attrs: Dict[str, List[str]],
  536. # model: str = None,
  537. # extract_additional: bool = True
  538. # ) -> dict:
  539. # """Use Groq LLM to extract attributes from any product type."""
  540. # if model is None:
  541. # model = settings.SUPPORTED_MODELS[0]
  542. # # Check if product text is empty or minimal
  543. # if not product_text or product_text == "No product information available":
  544. # return ProductAttributeService._create_error_response(
  545. # "No product information provided",
  546. # mandatory_attrs,
  547. # extract_additional
  548. # )
  549. # # Create structured prompt for mandatory attributes
  550. # mandatory_attr_list = []
  551. # for attr_name, allowed_values in mandatory_attrs.items():
  552. # mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
  553. # mandatory_attr_text = "\n".join(mandatory_attr_list)
  554. # additional_instruction = ""
  555. # if extract_additional:
  556. # additional_instruction = """
  557. # 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text
  558. # (such as Material, Size, Color, Brand, Dimensions, Weight, Features, Specifications, etc.)
  559. # and their values. Extract attributes that are specific and relevant to this product type."""
  560. # output_format = {
  561. # "mandatory": {attr: "value" for attr in mandatory_attrs.keys()},
  562. # "additional": {} if extract_additional else None
  563. # }
  564. # if not extract_additional:
  565. # output_format.pop("additional")
  566. # prompt = f"""
  567. # You are an intelligent product attribute extractor that works with ANY product type.
  568. # TASK:
  569. # 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value
  570. # from the provided list. Choose the value that best matches the product description.
  571. # {additional_instruction}
  572. # Product Text:
  573. # {product_text}
  574. # Mandatory Attribute Lists (MUST select one value for each):
  575. # {mandatory_attr_text}
  576. # CRITICAL INSTRUCTIONS:
  577. # - Return ONLY valid JSON, nothing else
  578. # - No explanations, no markdown, no text before or after the JSON
  579. # - For mandatory attributes, choose EXACTLY ONE value from the provided list that best matches
  580. # - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
  581. # - Work with whatever information is available - the product text may be incomplete
  582. # {f"- For additional attributes, extract any relevant information found in the product text" if extract_additional else ""}
  583. # - Be precise and only extract information that is explicitly stated or clearly implied
  584. # Required Output Format (ONLY THIS, NO OTHER TEXT):
  585. # {json.dumps(output_format, indent=2)}
  586. # """
  587. # payload = {
  588. # "model": model,
  589. # "messages": [
  590. # {
  591. # "role": "system",
  592. # "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
  593. # },
  594. # {"role": "user", "content": prompt}
  595. # ],
  596. # "temperature": 0.0,
  597. # "max_tokens": 1500
  598. # }
  599. # headers = {
  600. # "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  601. # "Content-Type": "application/json",
  602. # }
  603. # try:
  604. # response = requests.post(
  605. # settings.GROQ_API_URL,
  606. # headers=headers,
  607. # json=payload,
  608. # timeout=30
  609. # )
  610. # response.raise_for_status()
  611. # result_text = response.json()["choices"][0]["message"]["content"].strip()
  612. # # Clean the response
  613. # result_text = ProductAttributeService._clean_json_response(result_text)
  614. # # Parse JSON
  615. # parsed = json.loads(result_text)
  616. # # Validate and restructure if needed
  617. # parsed = ProductAttributeService._validate_response_structure(
  618. # parsed, mandatory_attrs, extract_additional
  619. # )
  620. # return parsed
  621. # except requests.exceptions.RequestException as e:
  622. # return ProductAttributeService._create_error_response(
  623. # str(e), mandatory_attrs, extract_additional
  624. # )
  625. # except json.JSONDecodeError as e:
  626. # return ProductAttributeService._create_error_response(
  627. # f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
  628. # )
  629. # except Exception as e:
  630. # return ProductAttributeService._create_error_response(
  631. # str(e), mandatory_attrs, extract_additional
  632. # )
  633. # @staticmethod
  634. # def extract_attributes_batch(
  635. # products: List[Dict],
  636. # mandatory_attrs: Dict[str, List[str]],
  637. # model: str = None,
  638. # extract_additional: bool = True,
  639. # process_image: bool = True,
  640. # max_workers: int = 5
  641. # ) -> Dict:
  642. # """Extract attributes for multiple products in parallel."""
  643. # results = []
  644. # successful = 0
  645. # failed = 0
  646. # ocr_service = OCRService()
  647. # def process_product(product_data):
  648. # """Process a single product."""
  649. # product_id = product_data.get('product_id', f"product_{len(results)}")
  650. # try:
  651. # # Process image if URL is provided
  652. # ocr_results = None
  653. # ocr_text = None
  654. # if process_image and product_data.get('image_url'):
  655. # ocr_results = ocr_service.process_image(product_data['image_url'])
  656. # # Extract attributes from OCR
  657. # if ocr_results and ocr_results.get('detected_text'):
  658. # ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
  659. # ocr_results, model
  660. # )
  661. # ocr_results['extracted_attributes'] = ocr_attrs
  662. # # Format OCR text for combining with product text
  663. # ocr_text = "\n".join([
  664. # f"{item['text']} (confidence: {item['confidence']:.2f})"
  665. # for item in ocr_results['detected_text']
  666. # ])
  667. # # Combine all product information
  668. # product_text = ProductAttributeService.combine_product_text(
  669. # title=product_data.get('title'),
  670. # short_desc=product_data.get('short_desc'),
  671. # long_desc=product_data.get('long_desc'),
  672. # ocr_text=ocr_text
  673. # )
  674. # # Extract attributes from combined text
  675. # result = ProductAttributeService.extract_attributes(
  676. # product_text=product_text,
  677. # mandatory_attrs=mandatory_attrs,
  678. # model=model,
  679. # extract_additional=extract_additional
  680. # )
  681. # result['product_id'] = product_id
  682. # # Add OCR results if available
  683. # if ocr_results:
  684. # result['ocr_results'] = ocr_results
  685. # # Check if extraction was successful
  686. # if 'error' not in result:
  687. # return result, True
  688. # else:
  689. # return result, False
  690. # except Exception as e:
  691. # return {
  692. # 'product_id': product_id,
  693. # 'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  694. # 'additional': {} if extract_additional else None,
  695. # 'error': f"Processing error: {str(e)}"
  696. # }, False
  697. # # Process products in parallel
  698. # with ThreadPoolExecutor(max_workers=max_workers) as executor:
  699. # future_to_product = {
  700. # executor.submit(process_product, product): product
  701. # for product in products
  702. # }
  703. # for future in as_completed(future_to_product):
  704. # try:
  705. # result, success = future.result()
  706. # results.append(result)
  707. # if success:
  708. # successful += 1
  709. # else:
  710. # failed += 1
  711. # except Exception as e:
  712. # failed += 1
  713. # results.append({
  714. # 'product_id': 'unknown',
  715. # 'mandatory': {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  716. # 'additional': {} if extract_additional else None,
  717. # 'error': f"Unexpected error: {str(e)}"
  718. # })
  719. # return {
  720. # 'results': results,
  721. # 'total_products': len(products),
  722. # 'successful': successful,
  723. # 'failed': failed
  724. # }
  725. # @staticmethod
  726. # def _clean_json_response(text: str) -> str:
  727. # """Clean LLM response to extract valid JSON."""
  728. # start_idx = text.find('{')
  729. # end_idx = text.rfind('}')
  730. # if start_idx != -1 and end_idx != -1:
  731. # text = text[start_idx:end_idx + 1]
  732. # if "```json" in text:
  733. # text = text.split("```json")[1].split("```")[0].strip()
  734. # elif "```" in text:
  735. # text = text.split("```")[1].split("```")[0].strip()
  736. # if text.startswith("json"):
  737. # text = text[4:].strip()
  738. # return text
  739. # @staticmethod
  740. # def _validate_response_structure(
  741. # parsed: dict,
  742. # mandatory_attrs: Dict[str, List[str]],
  743. # extract_additional: bool
  744. # ) -> dict:
  745. # """Validate and fix the response structure."""
  746. # expected_sections = ["mandatory"]
  747. # if extract_additional:
  748. # expected_sections.append("additional")
  749. # if not all(section in parsed for section in expected_sections):
  750. # if isinstance(parsed, dict):
  751. # mandatory_keys = set(mandatory_attrs.keys())
  752. # mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
  753. # additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
  754. # result = {"mandatory": mandatory}
  755. # if extract_additional:
  756. # result["additional"] = additional
  757. # return result
  758. # else:
  759. # return ProductAttributeService._create_error_response(
  760. # "Invalid response structure",
  761. # mandatory_attrs,
  762. # extract_additional,
  763. # str(parsed)
  764. # )
  765. # return parsed
  766. # @staticmethod
  767. # def _create_error_response(
  768. # error: str,
  769. # mandatory_attrs: Dict[str, List[str]],
  770. # extract_additional: bool,
  771. # raw_output: Optional[str] = None
  772. # ) -> dict:
  773. # """Create a standardized error response."""
  774. # response = {
  775. # "mandatory": {attr: "Not Specified" for attr in mandatory_attrs.keys()},
  776. # "error": error
  777. # }
  778. # if extract_additional:
  779. # response["additional"] = {}
  780. # if raw_output:
  781. # response["raw_output"] = raw_output
  782. # return response
  783. # # ==================== services.py ====================
  784. # import requests
  785. # import json
  786. # from typing import Dict, List, Optional, Tuple
  787. # from django.conf import settings
  788. # from concurrent.futures import ThreadPoolExecutor, as_completed
  789. # from sentence_transformers import SentenceTransformer, util
  790. # import numpy as np
  791. # from .ocr_service import OCRService
  792. # # Initialize embedding model for normalization
  793. # model_embedder = SentenceTransformer("all-MiniLM-L6-v2")
  794. # class ProductAttributeService:
  795. # """Service class for extracting product attributes using Groq LLM."""
  796. # @staticmethod
  797. # def combine_product_text(
  798. # title: Optional[str] = None,
  799. # short_desc: Optional[str] = None,
  800. # long_desc: Optional[str] = None,
  801. # ocr_text: Optional[str] = None
  802. # ) -> Tuple[str, Dict[str, str]]:
  803. # """
  804. # Combine product metadata into a single text block.
  805. # Returns: (combined_text, source_map) where source_map tracks which text came from where
  806. # """
  807. # parts = []
  808. # source_map = {}
  809. # if title:
  810. # title_str = str(title).strip()
  811. # parts.append(f"Title: {title_str}")
  812. # source_map['title'] = title_str
  813. # if short_desc:
  814. # short_str = str(short_desc).strip()
  815. # parts.append(f"Description: {short_str}")
  816. # source_map['short_desc'] = short_str
  817. # if long_desc:
  818. # long_str = str(long_desc).strip()
  819. # parts.append(f"Details: {long_str}")
  820. # source_map['long_desc'] = long_str
  821. # if ocr_text:
  822. # parts.append(f"OCR Text: {ocr_text}")
  823. # source_map['ocr_text'] = ocr_text
  824. # combined = "\n".join(parts).strip()
  825. # if not combined:
  826. # return "No product information available", {}
  827. # return combined, source_map
  828. # @staticmethod
  829. # def find_value_source(value: str, source_map: Dict[str, str]) -> str:
  830. # """
  831. # Find which source(s) contain the given value.
  832. # Returns the source name(s) where the value appears.
  833. # """
  834. # value_lower = value.lower()
  835. # # Split value into tokens for better matching
  836. # value_tokens = set(value_lower.replace("-", " ").split())
  837. # sources_found = []
  838. # source_scores = {}
  839. # for source_name, source_text in source_map.items():
  840. # source_lower = source_text.lower()
  841. # # Check for exact phrase match first
  842. # if value_lower in source_lower:
  843. # source_scores[source_name] = 1.0
  844. # continue
  845. # # Check for token matches
  846. # token_matches = sum(1 for token in value_tokens if token in source_lower)
  847. # if token_matches > 0:
  848. # source_scores[source_name] = token_matches / len(value_tokens)
  849. # # Return source with highest score, or all sources if multiple have same score
  850. # if source_scores:
  851. # max_score = max(source_scores.values())
  852. # sources_found = [s for s, score in source_scores.items() if score == max_score]
  853. # # Prioritize: title > short_desc > long_desc > ocr_text
  854. # priority = ['title', 'short_desc', 'long_desc', 'ocr_text']
  855. # for p in priority:
  856. # if p in sources_found:
  857. # return p
  858. # return sources_found[0] if sources_found else "Not found"
  859. # return "Not found"
  860. # @staticmethod
  861. # def extract_attributes_from_ocr(ocr_results: Dict, model: str = None) -> Dict:
  862. # """Extract structured attributes from OCR text using LLM."""
  863. # if model is None:
  864. # model = settings.SUPPORTED_MODELS[0]
  865. # detected_text = ocr_results.get('detected_text', [])
  866. # if not detected_text:
  867. # return {}
  868. # # Format OCR text for prompt
  869. # ocr_text = "\n".join([f"Text: {item['text']}, Confidence: {item['confidence']:.2f}"
  870. # for item in detected_text])
  871. # prompt = f"""
  872. # You are an AI model that extracts structured attributes from OCR text detected on product images.
  873. # Given the OCR detections below, infer the possible product attributes and return them as a clean JSON object.
  874. # OCR Text:
  875. # {ocr_text}
  876. # Extract relevant attributes like:
  877. # - brand
  878. # - model_number
  879. # - size (waist_size, length, etc.)
  880. # - collection
  881. # - any other relevant product information
  882. # Return a JSON object with only the attributes you can confidently identify.
  883. # If an attribute is not present, do not include it in the response.
  884. # """
  885. # payload = {
  886. # "model": model,
  887. # "messages": [
  888. # {
  889. # "role": "system",
  890. # "content": "You are a helpful AI that extracts structured data from OCR output. Return only valid JSON."
  891. # },
  892. # {"role": "user", "content": prompt}
  893. # ],
  894. # "temperature": 0.2,
  895. # "max_tokens": 500
  896. # }
  897. # headers = {
  898. # "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  899. # "Content-Type": "application/json",
  900. # }
  901. # try:
  902. # response = requests.post(
  903. # settings.GROQ_API_URL,
  904. # headers=headers,
  905. # json=payload,
  906. # timeout=30
  907. # )
  908. # response.raise_for_status()
  909. # result_text = response.json()["choices"][0]["message"]["content"].strip()
  910. # # Clean and parse JSON
  911. # result_text = ProductAttributeService._clean_json_response(result_text)
  912. # parsed = json.loads(result_text)
  913. # return parsed
  914. # except Exception as e:
  915. # return {"error": f"Failed to extract attributes from OCR: {str(e)}"}
  916. # @staticmethod
  917. # def calculate_attribute_relationships(
  918. # mandatory_attrs: Dict[str, List[str]],
  919. # product_text: str
  920. # ) -> Dict[str, float]:
  921. # """
  922. # Calculate semantic relationships between attribute values across different attributes.
  923. # Returns a matrix of cross-attribute value similarities.
  924. # """
  925. # pt_emb = model_embedder.encode(product_text, convert_to_tensor=True)
  926. # # Calculate similarities between all attribute values and product text
  927. # attr_scores = {}
  928. # for attr, values in mandatory_attrs.items():
  929. # attr_scores[attr] = {}
  930. # for val in values:
  931. # contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}"]
  932. # ctx_embs = [model_embedder.encode(c, convert_to_tensor=True) for c in contexts]
  933. # sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
  934. # attr_scores[attr][val] = sem_sim
  935. # # Calculate cross-attribute value relationships
  936. # relationships = {}
  937. # attr_list = list(mandatory_attrs.keys())
  938. # for i, attr1 in enumerate(attr_list):
  939. # for attr2 in attr_list[i+1:]:
  940. # # Calculate pairwise similarities between values of different attributes
  941. # for val1 in mandatory_attrs[attr1]:
  942. # for val2 in mandatory_attrs[attr2]:
  943. # emb1 = model_embedder.encode(val1, convert_to_tensor=True)
  944. # emb2 = model_embedder.encode(val2, convert_to_tensor=True)
  945. # sim = float(util.cos_sim(emb1, emb2).item())
  946. # # Store bidirectional relationships
  947. # key1 = f"{attr1}:{val1}->{attr2}:{val2}"
  948. # key2 = f"{attr2}:{val2}->{attr1}:{val1}"
  949. # relationships[key1] = sim
  950. # relationships[key2] = sim
  951. # return relationships
  952. # @staticmethod
  953. # def calculate_value_clusters(
  954. # values: List[str],
  955. # scores: List[Tuple[str, float]],
  956. # cluster_threshold: float = 0.4
  957. # ) -> List[List[str]]:
  958. # """
  959. # Group values into semantic clusters based on their similarity to each other.
  960. # Returns clusters of related values.
  961. # """
  962. # if len(values) <= 1:
  963. # return [[val] for val, _ in scores]
  964. # # Get embeddings for all values
  965. # embeddings = [model_embedder.encode(val, convert_to_tensor=True) for val in values]
  966. # # Calculate pairwise similarities
  967. # similarity_matrix = np.zeros((len(values), len(values)))
  968. # for i in range(len(values)):
  969. # for j in range(i+1, len(values)):
  970. # sim = float(util.cos_sim(embeddings[i], embeddings[j]).item())
  971. # similarity_matrix[i][j] = sim
  972. # similarity_matrix[j][i] = sim
  973. # # Simple clustering: group values with high similarity
  974. # clusters = []
  975. # visited = set()
  976. # for i, (val, score) in enumerate(scores):
  977. # if i in visited:
  978. # continue
  979. # cluster = [val]
  980. # visited.add(i)
  981. # # Find similar values
  982. # for j in range(len(values)):
  983. # if j not in visited and similarity_matrix[i][j] >= cluster_threshold:
  984. # cluster.append(values[j])
  985. # visited.add(j)
  986. # clusters.append(cluster)
  987. # return clusters
  988. # @staticmethod
  989. # def get_dynamic_threshold(
  990. # attr: str,
  991. # val: str,
  992. # base_score: float,
  993. # extracted_attrs: Dict[str, List[Dict[str, str]]],
  994. # relationships: Dict[str, float],
  995. # mandatory_attrs: Dict[str, List[str]],
  996. # base_threshold: float = 0.65,
  997. # boost_factor: float = 0.15
  998. # ) -> float:
  999. # """
  1000. # Calculate dynamic threshold based on relationships with already-extracted attributes.
  1001. # """
  1002. # threshold = base_threshold
  1003. # # Check relationships with already extracted attributes
  1004. # max_relationship = 0.0
  1005. # for other_attr, other_values_list in extracted_attrs.items():
  1006. # if other_attr == attr:
  1007. # continue
  1008. # for other_val_dict in other_values_list:
  1009. # other_val = other_val_dict['value']
  1010. # key = f"{attr}:{val}->{other_attr}:{other_val}"
  1011. # if key in relationships:
  1012. # max_relationship = max(max_relationship, relationships[key])
  1013. # # If strong relationship exists, lower threshold
  1014. # if max_relationship > 0.6:
  1015. # threshold = base_threshold - (boost_factor * max_relationship)
  1016. # return max(0.3, threshold)
  1017. # @staticmethod
  1018. # def get_adaptive_margin(
  1019. # scores: List[Tuple[str, float]],
  1020. # base_margin: float = 0.15,
  1021. # max_margin: float = 0.22
  1022. # ) -> float:
  1023. # """
  1024. # Calculate adaptive margin based on score distribution.
  1025. # """
  1026. # if len(scores) < 2:
  1027. # return base_margin
  1028. # score_values = [s for _, s in scores]
  1029. # best_score = score_values[0]
  1030. # # If best score is very low, use adaptive margin but be more conservative
  1031. # if best_score < 0.5:
  1032. # # Calculate score spread in top 3-4 scores only (more selective)
  1033. # top_scores = score_values[:min(4, len(score_values))]
  1034. # score_range = max(top_scores) - min(top_scores)
  1035. # # Very controlled margin increase
  1036. # if score_range < 0.30:
  1037. # # Much more conservative scaling
  1038. # score_factor = (0.5 - best_score) * 0.35
  1039. # adaptive = base_margin + score_factor + (0.30 - score_range) * 0.2
  1040. # return min(adaptive, max_margin)
  1041. # return base_margin
  1042. # @staticmethod
  1043. # def _lexical_evidence(product_text: str, label: str) -> float:
  1044. # """Calculate lexical overlap between product text and label."""
  1045. # pt = product_text.lower()
  1046. # tokens = [t for t in label.lower().replace("-", " ").split() if t]
  1047. # if not tokens:
  1048. # return 0.0
  1049. # hits = sum(1 for t in tokens if t in pt)
  1050. # return hits / len(tokens)
  1051. # @staticmethod
  1052. # def normalize_against_product_text(
  1053. # product_text: str,
  1054. # mandatory_attrs: Dict[str, List[str]],
  1055. # source_map: Dict[str, str],
  1056. # threshold_abs: float = 0.65,
  1057. # margin: float = 0.15,
  1058. # allow_multiple: bool = False,
  1059. # sem_weight: float = 0.8,
  1060. # lex_weight: float = 0.2,
  1061. # extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
  1062. # relationships: Optional[Dict[str, float]] = None,
  1063. # use_dynamic_thresholds: bool = True,
  1064. # use_adaptive_margin: bool = True,
  1065. # use_semantic_clustering: bool = True
  1066. # ) -> dict:
  1067. # """
  1068. # Score each allowed value against the product_text with dynamic thresholds.
  1069. # Returns dict with values in array format: [{"value": "...", "source": "..."}]
  1070. # """
  1071. # if extracted_attrs is None:
  1072. # extracted_attrs = {}
  1073. # if relationships is None:
  1074. # relationships = {}
  1075. # pt_emb = model_embedder.encode(product_text, convert_to_tensor=True)
  1076. # extracted = {}
  1077. # for attr, allowed_values in mandatory_attrs.items():
  1078. # scores: List[Tuple[str, float]] = []
  1079. # for val in allowed_values:
  1080. # contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}", f"{val} room"]
  1081. # ctx_embs = [model_embedder.encode(c, convert_to_tensor=True) for c in contexts]
  1082. # sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
  1083. # lex_score = ProductAttributeService._lexical_evidence(product_text, val)
  1084. # final_score = sem_weight * sem_sim + lex_weight * lex_score
  1085. # scores.append((val, final_score))
  1086. # scores.sort(key=lambda x: x[1], reverse=True)
  1087. # best_val, best_score = scores[0]
  1088. # # Calculate adaptive margin if enabled
  1089. # effective_margin = margin
  1090. # if allow_multiple and use_adaptive_margin:
  1091. # effective_margin = ProductAttributeService.get_adaptive_margin(scores, margin)
  1092. # if not allow_multiple:
  1093. # source = ProductAttributeService.find_value_source(best_val, source_map)
  1094. # extracted[attr] = [{"value": best_val, "source": source}]
  1095. # else:
  1096. # candidates = [best_val]
  1097. # use_base_threshold = best_score >= threshold_abs
  1098. # # Get semantic clusters if enabled
  1099. # clusters = []
  1100. # if use_semantic_clustering:
  1101. # clusters = ProductAttributeService.calculate_value_clusters(
  1102. # allowed_values, scores, cluster_threshold=0.4
  1103. # )
  1104. # best_cluster = next((c for c in clusters if best_val in c), [best_val])
  1105. # for val, sc in scores[1:]:
  1106. # # Calculate dynamic threshold for this value
  1107. # if use_dynamic_thresholds and extracted_attrs:
  1108. # dynamic_thresh = ProductAttributeService.get_dynamic_threshold(
  1109. # attr, val, sc, extracted_attrs, relationships,
  1110. # mandatory_attrs, threshold_abs
  1111. # )
  1112. # else:
  1113. # dynamic_thresh = threshold_abs
  1114. # within_margin = (best_score - sc) <= effective_margin
  1115. # above_threshold = sc >= dynamic_thresh
  1116. # # Check if in same semantic cluster as best value
  1117. # in_cluster = False
  1118. # if use_semantic_clustering and clusters:
  1119. # in_cluster = any(best_val in c and val in c for c in clusters)
  1120. # if use_base_threshold:
  1121. # # Best score is good, require threshold OR (cluster + margin)
  1122. # if above_threshold and within_margin:
  1123. # candidates.append(val)
  1124. # elif in_cluster and within_margin:
  1125. # candidates.append(val)
  1126. # else:
  1127. # # Best score is low, use margin OR cluster logic
  1128. # if within_margin:
  1129. # candidates.append(val)
  1130. # elif in_cluster and (best_score - sc) <= effective_margin * 2.0:
  1131. # # Extended margin for cluster members
  1132. # candidates.append(val)
  1133. # # Map each candidate to its source and create array format
  1134. # extracted[attr] = []
  1135. # for candidate in candidates:
  1136. # source = ProductAttributeService.find_value_source(candidate, source_map)
  1137. # extracted[attr].append({"value": candidate, "source": source})
  1138. # return extracted
  1139. # @staticmethod
  1140. # def extract_attributes(
  1141. # product_text: str,
  1142. # mandatory_attrs: Dict[str, List[str]],
  1143. # source_map: Dict[str, str] = None,
  1144. # model: str = None,
  1145. # extract_additional: bool = True,
  1146. # multiple: Optional[List[str]] = None,
  1147. # threshold_abs: float = 0.65,
  1148. # margin: float = 0.15,
  1149. # use_dynamic_thresholds: bool = True,
  1150. # use_adaptive_margin: bool = True,
  1151. # use_semantic_clustering: bool = True
  1152. # ) -> dict:
  1153. # """
  1154. # Use Groq LLM to extract attributes from any product type with enhanced multi-value selection.
  1155. # Now returns values in array format: [{"value": "...", "source": "..."}]
  1156. # """
  1157. # if model is None:
  1158. # model = settings.SUPPORTED_MODELS[0]
  1159. # if multiple is None:
  1160. # multiple = []
  1161. # if source_map is None:
  1162. # source_map = {}
  1163. # # Check if product text is empty or minimal
  1164. # if not product_text or product_text == "No product information available":
  1165. # return ProductAttributeService._create_error_response(
  1166. # "No product information provided",
  1167. # mandatory_attrs,
  1168. # extract_additional
  1169. # )
  1170. # # Create structured prompt for mandatory attributes
  1171. # mandatory_attr_list = []
  1172. # for attr_name, allowed_values in mandatory_attrs.items():
  1173. # mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
  1174. # mandatory_attr_text = "\n".join(mandatory_attr_list)
  1175. # additional_instruction = ""
  1176. # if extract_additional:
  1177. # additional_instruction = """
  1178. # 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text
  1179. # that are NOT in the mandatory list. Only include attributes where you can find actual values
  1180. # in the product text. Do NOT include attributes with "Not Specified" or empty values.
  1181. # Examples of attributes to look for (only if present): Brand, Material, Size, Color, Dimensions,
  1182. # Weight, Features, Style, Theme, Pattern, Finish, Care Instructions, etc."""
  1183. # output_format = {
  1184. # "mandatory": {attr: "value or list of values" for attr in mandatory_attrs.keys()},
  1185. # }
  1186. # if extract_additional:
  1187. # output_format["additional"] = {
  1188. # "example_attribute_1": "actual value found",
  1189. # "example_attribute_2": "actual value found"
  1190. # }
  1191. # output_format["additional"]["_note"] = "Only include attributes with actual values found in text"
  1192. # prompt = f"""
  1193. # You are an intelligent product attribute extractor that works with ANY product type.
  1194. # TASK:
  1195. # 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value(s)
  1196. # from the provided list. Choose the value(s) that best match the product description.
  1197. # {additional_instruction}
  1198. # Product Text:
  1199. # {product_text}
  1200. # Mandatory Attribute Lists (MUST select from these allowed values):
  1201. # {mandatory_attr_text}
  1202. # CRITICAL INSTRUCTIONS:
  1203. # - Return ONLY valid JSON, nothing else
  1204. # - No explanations, no markdown, no text before or after the JSON
  1205. # - For mandatory attributes, choose the value(s) from the provided list that best match
  1206. # - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
  1207. # - Prefer exact matches from the allowed values list over generic synonyms
  1208. # - If multiple values are plausible, you MAY return more than one
  1209. # {f"- For additional attributes: ONLY include attributes where you found actual values in the product text. DO NOT include attributes with 'Not Specified', 'None', 'N/A', or empty values. If you cannot find a value for an attribute, simply don't include that attribute." if extract_additional else ""}
  1210. # - Be precise and only extract information that is explicitly stated or clearly implied
  1211. # Required Output Format:
  1212. # {json.dumps(output_format, indent=2)}
  1213. # """
  1214. # payload = {
  1215. # "model": model,
  1216. # "messages": [
  1217. # {
  1218. # "role": "system",
  1219. # "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
  1220. # },
  1221. # {"role": "user", "content": prompt}
  1222. # ],
  1223. # "temperature": 0.0,
  1224. # "max_tokens": 1500
  1225. # }
  1226. # headers = {
  1227. # "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  1228. # "Content-Type": "application/json",
  1229. # }
  1230. # try:
  1231. # response = requests.post(
  1232. # settings.GROQ_API_URL,
  1233. # headers=headers,
  1234. # json=payload,
  1235. # timeout=30
  1236. # )
  1237. # response.raise_for_status()
  1238. # result_text = response.json()["choices"][0]["message"]["content"].strip()
  1239. # # Clean the response
  1240. # result_text = ProductAttributeService._clean_json_response(result_text)
  1241. # # Parse JSON
  1242. # parsed = json.loads(result_text)
  1243. # # Validate and restructure with source tracking
  1244. # parsed = ProductAttributeService._validate_response_structure(
  1245. # parsed, mandatory_attrs, extract_additional, source_map
  1246. # )
  1247. # # Clean up and add source tracking to additional attributes in array format
  1248. # if extract_additional and "additional" in parsed:
  1249. # cleaned_additional = {}
  1250. # for k, v in parsed["additional"].items():
  1251. # if v and v not in ["Not Specified", "None", "N/A", "", "not specified", "none", "n/a"]:
  1252. # if not (isinstance(v, str) and v.lower() in ["not specified", "none", "n/a", ""]):
  1253. # # Convert to array format if not already
  1254. # if isinstance(v, list):
  1255. # cleaned_additional[k] = []
  1256. # for item in v:
  1257. # if isinstance(item, dict) and "value" in item:
  1258. # if "source" not in item:
  1259. # item["source"] = ProductAttributeService.find_value_source(
  1260. # item["value"], source_map
  1261. # )
  1262. # cleaned_additional[k].append(item)
  1263. # else:
  1264. # source = ProductAttributeService.find_value_source(str(item), source_map)
  1265. # cleaned_additional[k].append({"value": str(item), "source": source})
  1266. # else:
  1267. # source = ProductAttributeService.find_value_source(str(v), source_map)
  1268. # cleaned_additional[k] = [{"value": str(v), "source": source}]
  1269. # parsed["additional"] = cleaned_additional
  1270. # # Calculate attribute relationships if using dynamic thresholds
  1271. # relationships = {}
  1272. # if use_dynamic_thresholds:
  1273. # relationships = ProductAttributeService.calculate_attribute_relationships(
  1274. # mandatory_attrs, product_text
  1275. # )
  1276. # # Process attributes in order, allowing earlier ones to influence later ones
  1277. # extracted_so_far = {}
  1278. # for attr in mandatory_attrs.keys():
  1279. # allow_multiple = attr in multiple
  1280. # result = ProductAttributeService.normalize_against_product_text(
  1281. # product_text=product_text,
  1282. # mandatory_attrs={attr: mandatory_attrs[attr]},
  1283. # source_map=source_map,
  1284. # threshold_abs=threshold_abs,
  1285. # margin=margin,
  1286. # allow_multiple=allow_multiple,
  1287. # extracted_attrs=extracted_so_far,
  1288. # relationships=relationships,
  1289. # use_dynamic_thresholds=use_dynamic_thresholds,
  1290. # use_adaptive_margin=use_adaptive_margin,
  1291. # use_semantic_clustering=use_semantic_clustering
  1292. # )
  1293. # # Result is already in array format from normalize_against_product_text
  1294. # parsed["mandatory"][attr] = result[attr]
  1295. # extracted_so_far[attr] = result[attr]
  1296. # return parsed
  1297. # except requests.exceptions.RequestException as e:
  1298. # return ProductAttributeService._create_error_response(
  1299. # str(e), mandatory_attrs, extract_additional
  1300. # )
  1301. # except json.JSONDecodeError as e:
  1302. # return ProductAttributeService._create_error_response(
  1303. # f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
  1304. # )
  1305. # except Exception as e:
  1306. # return ProductAttributeService._create_error_response(
  1307. # str(e), mandatory_attrs, extract_additional
  1308. # )
  1309. # @staticmethod
  1310. # def extract_attributes_batch(
  1311. # products: List[Dict],
  1312. # mandatory_attrs: Dict[str, List[str]],
  1313. # model: str = None,
  1314. # extract_additional: bool = True,
  1315. # process_image: bool = True,
  1316. # max_workers: int = 5,
  1317. # multiple: Optional[List[str]] = None,
  1318. # threshold_abs: float = 0.65,
  1319. # margin: float = 0.15,
  1320. # use_dynamic_thresholds: bool = True,
  1321. # use_adaptive_margin: bool = True,
  1322. # use_semantic_clustering: bool = True
  1323. # ) -> Dict:
  1324. # """Extract attributes for multiple products in parallel with enhanced multi-value selection and source tracking."""
  1325. # results = []
  1326. # successful = 0
  1327. # failed = 0
  1328. # ocr_service = OCRService()
  1329. # if multiple is None:
  1330. # multiple = []
  1331. # def process_product(product_data):
  1332. # """Process a single product."""
  1333. # product_id = product_data.get('product_id', f"product_{len(results)}")
  1334. # try:
  1335. # # Process image if URL is provided
  1336. # ocr_results = None
  1337. # ocr_text = None
  1338. # if process_image and product_data.get('image_url'):
  1339. # ocr_results = ocr_service.process_image(product_data['image_url'])
  1340. # # Extract attributes from OCR
  1341. # if ocr_results and ocr_results.get('detected_text'):
  1342. # ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
  1343. # ocr_results, model
  1344. # )
  1345. # ocr_results['extracted_attributes'] = ocr_attrs
  1346. # # Format OCR text for combining with product text
  1347. # ocr_text = "\n".join([
  1348. # f"{item['text']} (confidence: {item['confidence']:.2f})"
  1349. # for item in ocr_results['detected_text']
  1350. # ])
  1351. # # Combine all product information with source tracking
  1352. # product_text, source_map = ProductAttributeService.combine_product_text(
  1353. # title=product_data.get('title'),
  1354. # short_desc=product_data.get('short_desc'),
  1355. # long_desc=product_data.get('long_desc'),
  1356. # ocr_text=ocr_text
  1357. # )
  1358. # # Extract attributes from combined text with enhanced features
  1359. # result = ProductAttributeService.extract_attributes(
  1360. # product_text=product_text,
  1361. # mandatory_attrs=mandatory_attrs,
  1362. # source_map=source_map,
  1363. # model=model,
  1364. # extract_additional=extract_additional,
  1365. # multiple=multiple,
  1366. # threshold_abs=threshold_abs,
  1367. # margin=margin,
  1368. # use_dynamic_thresholds=use_dynamic_thresholds,
  1369. # use_adaptive_margin=use_adaptive_margin,
  1370. # use_semantic_clustering=use_semantic_clustering
  1371. # )
  1372. # result['product_id'] = product_id
  1373. # # Add OCR results if available
  1374. # if ocr_results:
  1375. # result['ocr_results'] = ocr_results
  1376. # # Check if extraction was successful
  1377. # if 'error' not in result:
  1378. # return result, True
  1379. # else:
  1380. # return result, False
  1381. # except Exception as e:
  1382. # return {
  1383. # 'product_id': product_id,
  1384. # 'mandatory': {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
  1385. # 'additional': {} if extract_additional else None,
  1386. # 'error': f"Processing error: {str(e)}"
  1387. # }, False
  1388. # # Process products in parallel
  1389. # with ThreadPoolExecutor(max_workers=max_workers) as executor:
  1390. # future_to_product = {
  1391. # executor.submit(process_product, product): product
  1392. # for product in products
  1393. # }
  1394. # for future in as_completed(future_to_product):
  1395. # try:
  1396. # result, success = future.result()
  1397. # results.append(result)
  1398. # if success:
  1399. # successful += 1
  1400. # else:
  1401. # failed += 1
  1402. # except Exception as e:
  1403. # failed += 1
  1404. # results.append({
  1405. # 'product_id': 'unknown',
  1406. # 'mandatory': {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
  1407. # 'additional': {} if extract_additional else None,
  1408. # 'error': f"Unexpected error: {str(e)}"
  1409. # })
  1410. # return {
  1411. # 'results': results,
  1412. # 'total_products': len(products),
  1413. # 'successful': successful,
  1414. # 'failed': failed
  1415. # }
  1416. # @staticmethod
  1417. # def _clean_json_response(text: str) -> str:
  1418. # """Clean LLM response to extract valid JSON."""
  1419. # start_idx = text.find('{')
  1420. # end_idx = text.rfind('}')
  1421. # if start_idx != -1 and end_idx != -1:
  1422. # text = text[start_idx:end_idx + 1]
  1423. # if "```json" in text:
  1424. # text = text.split("```json")[1].split("```")[0].strip()
  1425. # elif "```" in text:
  1426. # text = text.split("```")[1].split("```")[0].strip()
  1427. # if text.startswith("json"):
  1428. # text = text[4:].strip()
  1429. # return text
  1430. # @staticmethod
  1431. # def _validate_response_structure(
  1432. # parsed: dict,
  1433. # mandatory_attrs: Dict[str, List[str]],
  1434. # extract_additional: bool,
  1435. # source_map: Dict[str, str] = None
  1436. # ) -> dict:
  1437. # """Validate and fix the response structure, ensuring array format with source tracking."""
  1438. # if source_map is None:
  1439. # source_map = {}
  1440. # expected_sections = ["mandatory"]
  1441. # if extract_additional:
  1442. # expected_sections.append("additional")
  1443. # if not all(section in parsed for section in expected_sections):
  1444. # if isinstance(parsed, dict):
  1445. # mandatory_keys = set(mandatory_attrs.keys())
  1446. # mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
  1447. # additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
  1448. # result = {"mandatory": mandatory}
  1449. # if extract_additional:
  1450. # result["additional"] = additional
  1451. # parsed = result
  1452. # else:
  1453. # return ProductAttributeService._create_error_response(
  1454. # "Invalid response structure",
  1455. # mandatory_attrs,
  1456. # extract_additional,
  1457. # str(parsed)
  1458. # )
  1459. # # Convert mandatory attributes to array format with source tracking
  1460. # if "mandatory" in parsed:
  1461. # converted_mandatory = {}
  1462. # for attr, value in parsed["mandatory"].items():
  1463. # if isinstance(value, list):
  1464. # # Already in array format, ensure each item has source
  1465. # converted_mandatory[attr] = []
  1466. # for item in value:
  1467. # if isinstance(item, dict) and "value" in item:
  1468. # # Already has proper structure
  1469. # if "source" not in item:
  1470. # item["source"] = ProductAttributeService.find_value_source(
  1471. # item["value"], source_map
  1472. # )
  1473. # converted_mandatory[attr].append(item)
  1474. # else:
  1475. # # Convert string to proper format
  1476. # source = ProductAttributeService.find_value_source(str(item), source_map)
  1477. # converted_mandatory[attr].append({"value": str(item), "source": source})
  1478. # else:
  1479. # # Single value - convert to array format
  1480. # source = ProductAttributeService.find_value_source(str(value), source_map)
  1481. # converted_mandatory[attr] = [{"value": str(value), "source": source}]
  1482. # parsed["mandatory"] = converted_mandatory
  1483. # return parsed
  1484. # @staticmethod
  1485. # def _create_error_response(
  1486. # error: str,
  1487. # mandatory_attrs: Dict[str, List[str]],
  1488. # extract_additional: bool,
  1489. # raw_output: Optional[str] = None
  1490. # ) -> dict:
  1491. # """Create a standardized error response in array format."""
  1492. # response = {
  1493. # "mandatory": {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
  1494. # "error": error
  1495. # }
  1496. # if extract_additional:
  1497. # response["additional"] = {}
  1498. # if raw_output:
  1499. # response["raw_output"] = raw_output
  1500. # return response
  1501. # ==================== services.py ====================
  1502. import requests
  1503. import json
  1504. from typing import Dict, List, Optional, Tuple
  1505. from django.conf import settings
  1506. from concurrent.futures import ThreadPoolExecutor, as_completed
  1507. from sentence_transformers import SentenceTransformer, util
  1508. import numpy as np
  1509. from .ocr_service import OCRService
  1510. # Initialize embedding model for normalization
  1511. model_embedder = SentenceTransformer("all-MiniLM-L6-v2")
  1512. class ProductAttributeService:
  1513. """Service class for extracting product attributes using Groq LLM."""
  1514. @staticmethod
  1515. def combine_product_text(
  1516. title: Optional[str] = None,
  1517. short_desc: Optional[str] = None,
  1518. long_desc: Optional[str] = None,
  1519. ocr_text: Optional[str] = None
  1520. ) -> Tuple[str, Dict[str, str]]:
  1521. """
  1522. Combine product metadata into a single text block.
  1523. Returns: (combined_text, source_map) where source_map tracks which text came from where
  1524. """
  1525. parts = []
  1526. source_map = {}
  1527. if title:
  1528. title_str = str(title).strip()
  1529. parts.append(f"Title: {title_str}")
  1530. source_map['title'] = title_str
  1531. if short_desc:
  1532. short_str = str(short_desc).strip()
  1533. parts.append(f"Description: {short_str}")
  1534. source_map['short_desc'] = short_str
  1535. if long_desc:
  1536. long_str = str(long_desc).strip()
  1537. parts.append(f"Details: {long_str}")
  1538. source_map['long_desc'] = long_str
  1539. if ocr_text:
  1540. parts.append(f"OCR Text: {ocr_text}")
  1541. source_map['ocr_text'] = ocr_text
  1542. combined = "\n".join(parts).strip()
  1543. if not combined:
  1544. return "No product information available", {}
  1545. return combined, source_map
  1546. @staticmethod
  1547. def find_value_source(value: str, source_map: Dict[str, str]) -> str:
  1548. """
  1549. Find which source(s) contain the given value.
  1550. Returns the source name(s) where the value appears.
  1551. """
  1552. value_lower = value.lower()
  1553. # Split value into tokens for better matching
  1554. value_tokens = set(value_lower.replace("-", " ").split())
  1555. sources_found = []
  1556. source_scores = {}
  1557. for source_name, source_text in source_map.items():
  1558. source_lower = source_text.lower()
  1559. # Check for exact phrase match first
  1560. if value_lower in source_lower:
  1561. source_scores[source_name] = 1.0
  1562. continue
  1563. # Check for token matches
  1564. token_matches = sum(1 for token in value_tokens if token in source_lower)
  1565. if token_matches > 0:
  1566. source_scores[source_name] = token_matches / len(value_tokens)
  1567. # Return source with highest score, or all sources if multiple have same score
  1568. if source_scores:
  1569. max_score = max(source_scores.values())
  1570. sources_found = [s for s, score in source_scores.items() if score == max_score]
  1571. # Prioritize: title > short_desc > long_desc > ocr_text
  1572. priority = ['title', 'short_desc', 'long_desc', 'ocr_text']
  1573. for p in priority:
  1574. if p in sources_found:
  1575. return p
  1576. return sources_found[0] if sources_found else "Not found"
  1577. return "Not found"
  1578. @staticmethod
  1579. def format_visual_attributes(visual_attributes: Dict) -> Dict:
  1580. """
  1581. Convert visual attributes to array format with source tracking.
  1582. Source is always 'image' for visual attributes.
  1583. """
  1584. formatted = {}
  1585. for key, value in visual_attributes.items():
  1586. if isinstance(value, list):
  1587. # Already a list (like color_palette)
  1588. formatted[key] = [{"value": str(item), "source": "image"} for item in value]
  1589. elif isinstance(value, dict):
  1590. # Nested dictionary - format recursively
  1591. nested_formatted = {}
  1592. for nested_key, nested_value in value.items():
  1593. if isinstance(nested_value, list):
  1594. nested_formatted[nested_key] = [{"value": str(item), "source": "image"} for item in nested_value]
  1595. else:
  1596. nested_formatted[nested_key] = [{"value": str(nested_value), "source": "image"}]
  1597. formatted[key] = nested_formatted
  1598. else:
  1599. # Single value
  1600. formatted[key] = [{"value": str(value), "source": "image"}]
  1601. return formatted
  1602. @staticmethod
  1603. def extract_attributes_from_ocr(ocr_results: Dict, model: str = None) -> Dict:
  1604. """Extract structured attributes from OCR text using LLM."""
  1605. if model is None:
  1606. model = settings.SUPPORTED_MODELS[0]
  1607. detected_text = ocr_results.get('detected_text', [])
  1608. if not detected_text:
  1609. return {}
  1610. # Format OCR text for prompt
  1611. ocr_text = "\n".join([f"Text: {item['text']}, Confidence: {item['confidence']:.2f}"
  1612. for item in detected_text])
  1613. prompt = f"""
  1614. You are an AI model that extracts structured attributes from OCR text detected on product images.
  1615. Given the OCR detections below, infer the possible product attributes and return them as a clean JSON object.
  1616. OCR Text:
  1617. {ocr_text}
  1618. Extract relevant attributes like:
  1619. - brand
  1620. - model_number
  1621. - size (waist_size, length, etc.)
  1622. - collection
  1623. - any other relevant product information
  1624. Return a JSON object with only the attributes you can confidently identify.
  1625. If an attribute is not present, do not include it in the response.
  1626. """
  1627. payload = {
  1628. "model": model,
  1629. "messages": [
  1630. {
  1631. "role": "system",
  1632. "content": "You are a helpful AI that extracts structured data from OCR output. Return only valid JSON."
  1633. },
  1634. {"role": "user", "content": prompt}
  1635. ],
  1636. "temperature": 0.2,
  1637. "max_tokens": 500
  1638. }
  1639. headers = {
  1640. "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  1641. "Content-Type": "application/json",
  1642. }
  1643. try:
  1644. response = requests.post(
  1645. settings.GROQ_API_URL,
  1646. headers=headers,
  1647. json=payload,
  1648. timeout=30
  1649. )
  1650. response.raise_for_status()
  1651. result_text = response.json()["choices"][0]["message"]["content"].strip()
  1652. # Clean and parse JSON
  1653. result_text = ProductAttributeService._clean_json_response(result_text)
  1654. parsed = json.loads(result_text)
  1655. # Convert to array format with source tracking
  1656. formatted_attributes = {}
  1657. for key, value in parsed.items():
  1658. if key == "error":
  1659. continue
  1660. # Handle nested dictionaries (like size)
  1661. if isinstance(value, dict):
  1662. nested_formatted = {}
  1663. for nested_key, nested_value in value.items():
  1664. nested_formatted[nested_key] = [{"value": str(nested_value), "source": "image"}]
  1665. formatted_attributes[key] = nested_formatted
  1666. elif isinstance(value, list):
  1667. # Already a list, convert each item
  1668. formatted_attributes[key] = [{"value": str(item), "source": "image"} for item in value]
  1669. else:
  1670. # Single value
  1671. formatted_attributes[key] = [{"value": str(value), "source": "image"}]
  1672. return formatted_attributes
  1673. except Exception as e:
  1674. return {"error": f"Failed to extract attributes from OCR: {str(e)}"}
  1675. @staticmethod
  1676. def calculate_attribute_relationships(
  1677. mandatory_attrs: Dict[str, List[str]],
  1678. product_text: str
  1679. ) -> Dict[str, float]:
  1680. """
  1681. Calculate semantic relationships between attribute values across different attributes.
  1682. Returns a matrix of cross-attribute value similarities.
  1683. """
  1684. pt_emb = model_embedder.encode(product_text, convert_to_tensor=True)
  1685. # Calculate similarities between all attribute values and product text
  1686. attr_scores = {}
  1687. for attr, values in mandatory_attrs.items():
  1688. attr_scores[attr] = {}
  1689. for val in values:
  1690. contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}"]
  1691. ctx_embs = [model_embedder.encode(c, convert_to_tensor=True) for c in contexts]
  1692. sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
  1693. attr_scores[attr][val] = sem_sim
  1694. # Calculate cross-attribute value relationships
  1695. relationships = {}
  1696. attr_list = list(mandatory_attrs.keys())
  1697. for i, attr1 in enumerate(attr_list):
  1698. for attr2 in attr_list[i+1:]:
  1699. # Calculate pairwise similarities between values of different attributes
  1700. for val1 in mandatory_attrs[attr1]:
  1701. for val2 in mandatory_attrs[attr2]:
  1702. emb1 = model_embedder.encode(val1, convert_to_tensor=True)
  1703. emb2 = model_embedder.encode(val2, convert_to_tensor=True)
  1704. sim = float(util.cos_sim(emb1, emb2).item())
  1705. # Store bidirectional relationships
  1706. key1 = f"{attr1}:{val1}->{attr2}:{val2}"
  1707. key2 = f"{attr2}:{val2}->{attr1}:{val1}"
  1708. relationships[key1] = sim
  1709. relationships[key2] = sim
  1710. return relationships
  1711. @staticmethod
  1712. def calculate_value_clusters(
  1713. values: List[str],
  1714. scores: List[Tuple[str, float]],
  1715. cluster_threshold: float = 0.4
  1716. ) -> List[List[str]]:
  1717. """
  1718. Group values into semantic clusters based on their similarity to each other.
  1719. Returns clusters of related values.
  1720. """
  1721. if len(values) <= 1:
  1722. return [[val] for val, _ in scores]
  1723. # Get embeddings for all values
  1724. embeddings = [model_embedder.encode(val, convert_to_tensor=True) for val in values]
  1725. # Calculate pairwise similarities
  1726. similarity_matrix = np.zeros((len(values), len(values)))
  1727. for i in range(len(values)):
  1728. for j in range(i+1, len(values)):
  1729. sim = float(util.cos_sim(embeddings[i], embeddings[j]).item())
  1730. similarity_matrix[i][j] = sim
  1731. similarity_matrix[j][i] = sim
  1732. # Simple clustering: group values with high similarity
  1733. clusters = []
  1734. visited = set()
  1735. for i, (val, score) in enumerate(scores):
  1736. if i in visited:
  1737. continue
  1738. cluster = [val]
  1739. visited.add(i)
  1740. # Find similar values
  1741. for j in range(len(values)):
  1742. if j not in visited and similarity_matrix[i][j] >= cluster_threshold:
  1743. cluster.append(values[j])
  1744. visited.add(j)
  1745. clusters.append(cluster)
  1746. return clusters
  1747. @staticmethod
  1748. def get_dynamic_threshold(
  1749. attr: str,
  1750. val: str,
  1751. base_score: float,
  1752. extracted_attrs: Dict[str, List[Dict[str, str]]],
  1753. relationships: Dict[str, float],
  1754. mandatory_attrs: Dict[str, List[str]],
  1755. base_threshold: float = 0.65,
  1756. boost_factor: float = 0.15
  1757. ) -> float:
  1758. """
  1759. Calculate dynamic threshold based on relationships with already-extracted attributes.
  1760. """
  1761. threshold = base_threshold
  1762. # Check relationships with already extracted attributes
  1763. max_relationship = 0.0
  1764. for other_attr, other_values_list in extracted_attrs.items():
  1765. if other_attr == attr:
  1766. continue
  1767. for other_val_dict in other_values_list:
  1768. other_val = other_val_dict['value']
  1769. key = f"{attr}:{val}->{other_attr}:{other_val}"
  1770. if key in relationships:
  1771. max_relationship = max(max_relationship, relationships[key])
  1772. # If strong relationship exists, lower threshold
  1773. if max_relationship > 0.6:
  1774. threshold = base_threshold - (boost_factor * max_relationship)
  1775. return max(0.3, threshold)
  1776. @staticmethod
  1777. def get_adaptive_margin(
  1778. scores: List[Tuple[str, float]],
  1779. base_margin: float = 0.15,
  1780. max_margin: float = 0.22
  1781. ) -> float:
  1782. """
  1783. Calculate adaptive margin based on score distribution.
  1784. """
  1785. if len(scores) < 2:
  1786. return base_margin
  1787. score_values = [s for _, s in scores]
  1788. best_score = score_values[0]
  1789. # If best score is very low, use adaptive margin but be more conservative
  1790. if best_score < 0.5:
  1791. # Calculate score spread in top 3-4 scores only (more selective)
  1792. top_scores = score_values[:min(4, len(score_values))]
  1793. score_range = max(top_scores) - min(top_scores)
  1794. # Very controlled margin increase
  1795. if score_range < 0.30:
  1796. # Much more conservative scaling
  1797. score_factor = (0.5 - best_score) * 0.35
  1798. adaptive = base_margin + score_factor + (0.30 - score_range) * 0.2
  1799. return min(adaptive, max_margin)
  1800. return base_margin
  1801. @staticmethod
  1802. def _lexical_evidence(product_text: str, label: str) -> float:
  1803. """Calculate lexical overlap between product text and label."""
  1804. pt = product_text.lower()
  1805. tokens = [t for t in label.lower().replace("-", " ").split() if t]
  1806. if not tokens:
  1807. return 0.0
  1808. hits = sum(1 for t in tokens if t in pt)
  1809. return hits / len(tokens)
  1810. @staticmethod
  1811. def normalize_against_product_text(
  1812. product_text: str,
  1813. mandatory_attrs: Dict[str, List[str]],
  1814. source_map: Dict[str, str],
  1815. threshold_abs: float = 0.65,
  1816. margin: float = 0.15,
  1817. allow_multiple: bool = False,
  1818. sem_weight: float = 0.8,
  1819. lex_weight: float = 0.2,
  1820. extracted_attrs: Optional[Dict[str, List[Dict[str, str]]]] = None,
  1821. relationships: Optional[Dict[str, float]] = None,
  1822. use_dynamic_thresholds: bool = True,
  1823. use_adaptive_margin: bool = True,
  1824. use_semantic_clustering: bool = True
  1825. ) -> dict:
  1826. """
  1827. Score each allowed value against the product_text with dynamic thresholds.
  1828. Returns dict with values in array format: [{"value": "...", "source": "..."}]
  1829. """
  1830. if extracted_attrs is None:
  1831. extracted_attrs = {}
  1832. if relationships is None:
  1833. relationships = {}
  1834. pt_emb = model_embedder.encode(product_text, convert_to_tensor=True)
  1835. extracted = {}
  1836. for attr, allowed_values in mandatory_attrs.items():
  1837. scores: List[Tuple[str, float]] = []
  1838. for val in allowed_values:
  1839. contexts = [val, f"for {val}", f"use in {val}", f"suitable for {val}", f"{val} room"]
  1840. ctx_embs = [model_embedder.encode(c, convert_to_tensor=True) for c in contexts]
  1841. sem_sim = max(float(util.cos_sim(pt_emb, ce).item()) for ce in ctx_embs)
  1842. lex_score = ProductAttributeService._lexical_evidence(product_text, val)
  1843. final_score = sem_weight * sem_sim + lex_weight * lex_score
  1844. scores.append((val, final_score))
  1845. scores.sort(key=lambda x: x[1], reverse=True)
  1846. best_val, best_score = scores[0]
  1847. # Calculate adaptive margin if enabled
  1848. effective_margin = margin
  1849. if allow_multiple and use_adaptive_margin:
  1850. effective_margin = ProductAttributeService.get_adaptive_margin(scores, margin)
  1851. if not allow_multiple:
  1852. source = ProductAttributeService.find_value_source(best_val, source_map)
  1853. extracted[attr] = [{"value": best_val, "source": source}]
  1854. else:
  1855. candidates = [best_val]
  1856. use_base_threshold = best_score >= threshold_abs
  1857. # Get semantic clusters if enabled
  1858. clusters = []
  1859. if use_semantic_clustering:
  1860. clusters = ProductAttributeService.calculate_value_clusters(
  1861. allowed_values, scores, cluster_threshold=0.4
  1862. )
  1863. best_cluster = next((c for c in clusters if best_val in c), [best_val])
  1864. for val, sc in scores[1:]:
  1865. # Calculate dynamic threshold for this value
  1866. if use_dynamic_thresholds and extracted_attrs:
  1867. dynamic_thresh = ProductAttributeService.get_dynamic_threshold(
  1868. attr, val, sc, extracted_attrs, relationships,
  1869. mandatory_attrs, threshold_abs
  1870. )
  1871. else:
  1872. dynamic_thresh = threshold_abs
  1873. within_margin = (best_score - sc) <= effective_margin
  1874. above_threshold = sc >= dynamic_thresh
  1875. # Check if in same semantic cluster as best value
  1876. in_cluster = False
  1877. if use_semantic_clustering and clusters:
  1878. in_cluster = any(best_val in c and val in c for c in clusters)
  1879. if use_base_threshold:
  1880. # Best score is good, require threshold OR (cluster + margin)
  1881. if above_threshold and within_margin:
  1882. candidates.append(val)
  1883. elif in_cluster and within_margin:
  1884. candidates.append(val)
  1885. else:
  1886. # Best score is low, use margin OR cluster logic
  1887. if within_margin:
  1888. candidates.append(val)
  1889. elif in_cluster and (best_score - sc) <= effective_margin * 2.0:
  1890. # Extended margin for cluster members
  1891. candidates.append(val)
  1892. # Map each candidate to its source and create array format
  1893. extracted[attr] = []
  1894. for candidate in candidates:
  1895. source = ProductAttributeService.find_value_source(candidate, source_map)
  1896. extracted[attr].append({"value": candidate, "source": source})
  1897. return extracted
  1898. @staticmethod
  1899. def extract_attributes(
  1900. product_text: str,
  1901. mandatory_attrs: Dict[str, List[str]],
  1902. source_map: Dict[str, str] = None,
  1903. model: str = None,
  1904. extract_additional: bool = True,
  1905. multiple: Optional[List[str]] = None,
  1906. threshold_abs: float = 0.65,
  1907. margin: float = 0.15,
  1908. use_dynamic_thresholds: bool = True,
  1909. use_adaptive_margin: bool = True,
  1910. use_semantic_clustering: bool = True
  1911. ) -> dict:
  1912. """
  1913. Use Groq LLM to extract attributes from any product type with enhanced multi-value selection.
  1914. Now returns values in array format: [{"value": "...", "source": "..."}]
  1915. """
  1916. if model is None:
  1917. model = settings.SUPPORTED_MODELS[0]
  1918. if multiple is None:
  1919. multiple = []
  1920. if source_map is None:
  1921. source_map = {}
  1922. # Check if product text is empty or minimal
  1923. if not product_text or product_text == "No product information available":
  1924. return ProductAttributeService._create_error_response(
  1925. "No product information provided",
  1926. mandatory_attrs,
  1927. extract_additional
  1928. )
  1929. # Create structured prompt for mandatory attributes
  1930. mandatory_attr_list = []
  1931. for attr_name, allowed_values in mandatory_attrs.items():
  1932. mandatory_attr_list.append(f"{attr_name}: {', '.join(allowed_values)}")
  1933. mandatory_attr_text = "\n".join(mandatory_attr_list)
  1934. additional_instruction = ""
  1935. if extract_additional:
  1936. additional_instruction = """
  1937. 2. Extract ADDITIONAL attributes: Identify any other relevant attributes from the product text
  1938. that are NOT in the mandatory list. Only include attributes where you can find actual values
  1939. in the product text. Do NOT include attributes with "Not Specified" or empty values.
  1940. Examples of attributes to look for (only if present): Brand, Material, Size, Color, Dimensions,
  1941. Weight, Features, Style, Theme, Pattern, Finish, Care Instructions, etc."""
  1942. output_format = {
  1943. "mandatory": {attr: "value or list of values" for attr in mandatory_attrs.keys()},
  1944. }
  1945. if extract_additional:
  1946. output_format["additional"] = {
  1947. "example_attribute_1": "actual value found",
  1948. "example_attribute_2": "actual value found"
  1949. }
  1950. output_format["additional"]["_note"] = "Only include attributes with actual values found in text"
  1951. prompt = f"""
  1952. You are an intelligent product attribute extractor that works with ANY product type.
  1953. TASK:
  1954. 1. Extract MANDATORY attributes: For each mandatory attribute, select the most appropriate value(s)
  1955. from the provided list. Choose the value(s) that best match the product description.
  1956. {additional_instruction}
  1957. Product Text:
  1958. {product_text}
  1959. Mandatory Attribute Lists (MUST select from these allowed values):
  1960. {mandatory_attr_text}
  1961. CRITICAL INSTRUCTIONS:
  1962. - Return ONLY valid JSON, nothing else
  1963. - No explanations, no markdown, no text before or after the JSON
  1964. - For mandatory attributes, choose the value(s) from the provided list that best match
  1965. - If a mandatory attribute cannot be determined from the product text, use "Not Specified"
  1966. - Prefer exact matches from the allowed values list over generic synonyms
  1967. - If multiple values are plausible, you MAY return more than one
  1968. {f"- For additional attributes: ONLY include attributes where you found actual values in the product text. DO NOT include attributes with 'Not Specified', 'None', 'N/A', or empty values. If you cannot find a value for an attribute, simply don't include that attribute." if extract_additional else ""}
  1969. - Be precise and only extract information that is explicitly stated or clearly implied
  1970. Required Output Format:
  1971. {json.dumps(output_format, indent=2)}
  1972. """
  1973. payload = {
  1974. "model": model,
  1975. "messages": [
  1976. {
  1977. "role": "system",
  1978. "content": f"You are a precise attribute extraction model. Return ONLY valid JSON with {'mandatory and additional' if extract_additional else 'mandatory'} sections. No explanations, no markdown, no other text."
  1979. },
  1980. {"role": "user", "content": prompt}
  1981. ],
  1982. "temperature": 0.0,
  1983. "max_tokens": 1500
  1984. }
  1985. headers = {
  1986. "Authorization": f"Bearer {settings.GROQ_API_KEY}",
  1987. "Content-Type": "application/json",
  1988. }
  1989. try:
  1990. response = requests.post(
  1991. settings.GROQ_API_URL,
  1992. headers=headers,
  1993. json=payload,
  1994. timeout=30
  1995. )
  1996. response.raise_for_status()
  1997. result_text = response.json()["choices"][0]["message"]["content"].strip()
  1998. # Clean the response
  1999. result_text = ProductAttributeService._clean_json_response(result_text)
  2000. # Parse JSON
  2001. parsed = json.loads(result_text)
  2002. # Validate and restructure with source tracking
  2003. parsed = ProductAttributeService._validate_response_structure(
  2004. parsed, mandatory_attrs, extract_additional, source_map
  2005. )
  2006. # Clean up and add source tracking to additional attributes in array format
  2007. if extract_additional and "additional" in parsed:
  2008. cleaned_additional = {}
  2009. for k, v in parsed["additional"].items():
  2010. if v and v not in ["Not Specified", "None", "N/A", "", "not specified", "none", "n/a"]:
  2011. if not (isinstance(v, str) and v.lower() in ["not specified", "none", "n/a", ""]):
  2012. # Convert to array format if not already
  2013. if isinstance(v, list):
  2014. cleaned_additional[k] = []
  2015. for item in v:
  2016. if isinstance(item, dict) and "value" in item:
  2017. if "source" not in item:
  2018. item["source"] = ProductAttributeService.find_value_source(
  2019. item["value"], source_map
  2020. )
  2021. cleaned_additional[k].append(item)
  2022. else:
  2023. source = ProductAttributeService.find_value_source(str(item), source_map)
  2024. cleaned_additional[k].append({"value": str(item), "source": source})
  2025. else:
  2026. source = ProductAttributeService.find_value_source(str(v), source_map)
  2027. cleaned_additional[k] = [{"value": str(v), "source": source}]
  2028. parsed["additional"] = cleaned_additional
  2029. # Calculate attribute relationships if using dynamic thresholds
  2030. relationships = {}
  2031. if use_dynamic_thresholds:
  2032. relationships = ProductAttributeService.calculate_attribute_relationships(
  2033. mandatory_attrs, product_text
  2034. )
  2035. # Process attributes in order, allowing earlier ones to influence later ones
  2036. extracted_so_far = {}
  2037. for attr in mandatory_attrs.keys():
  2038. allow_multiple = attr in multiple
  2039. result = ProductAttributeService.normalize_against_product_text(
  2040. product_text=product_text,
  2041. mandatory_attrs={attr: mandatory_attrs[attr]},
  2042. source_map=source_map,
  2043. threshold_abs=threshold_abs,
  2044. margin=margin,
  2045. allow_multiple=allow_multiple,
  2046. extracted_attrs=extracted_so_far,
  2047. relationships=relationships,
  2048. use_dynamic_thresholds=use_dynamic_thresholds,
  2049. use_adaptive_margin=use_adaptive_margin,
  2050. use_semantic_clustering=use_semantic_clustering
  2051. )
  2052. # Result is already in array format from normalize_against_product_text
  2053. parsed["mandatory"][attr] = result[attr]
  2054. extracted_so_far[attr] = result[attr]
  2055. return parsed
  2056. except requests.exceptions.RequestException as e:
  2057. return ProductAttributeService._create_error_response(
  2058. str(e), mandatory_attrs, extract_additional
  2059. )
  2060. except json.JSONDecodeError as e:
  2061. return ProductAttributeService._create_error_response(
  2062. f"Invalid JSON: {str(e)}", mandatory_attrs, extract_additional, result_text
  2063. )
  2064. except Exception as e:
  2065. return ProductAttributeService._create_error_response(
  2066. str(e), mandatory_attrs, extract_additional
  2067. )
  2068. @staticmethod
  2069. def extract_attributes_batch(
  2070. products: List[Dict],
  2071. mandatory_attrs: Dict[str, List[str]],
  2072. model: str = None,
  2073. extract_additional: bool = True,
  2074. process_image: bool = True,
  2075. max_workers: int = 5,
  2076. multiple: Optional[List[str]] = None,
  2077. threshold_abs: float = 0.65,
  2078. margin: float = 0.15,
  2079. use_dynamic_thresholds: bool = True,
  2080. use_adaptive_margin: bool = True,
  2081. use_semantic_clustering: bool = True
  2082. ) -> Dict:
  2083. """Extract attributes for multiple products in parallel with enhanced multi-value selection and source tracking."""
  2084. results = []
  2085. successful = 0
  2086. failed = 0
  2087. ocr_service = OCRService()
  2088. if multiple is None:
  2089. multiple = []
  2090. def process_product(product_data):
  2091. """Process a single product."""
  2092. product_id = product_data.get('product_id', f"product_{len(results)}")
  2093. try:
  2094. # Process image if URL is provided
  2095. ocr_results = None
  2096. ocr_text = None
  2097. if process_image and product_data.get('image_url'):
  2098. ocr_results = ocr_service.process_image(product_data['image_url'])
  2099. # Extract attributes from OCR
  2100. if ocr_results and ocr_results.get('detected_text'):
  2101. ocr_attrs = ProductAttributeService.extract_attributes_from_ocr(
  2102. ocr_results, model
  2103. )
  2104. ocr_results['extracted_attributes'] = ocr_attrs
  2105. # Format OCR text for combining with product text
  2106. ocr_text = "\n".join([
  2107. f"{item['text']} (confidence: {item['confidence']:.2f})"
  2108. for item in ocr_results['detected_text']
  2109. ])
  2110. # Combine all product information with source tracking
  2111. product_text, source_map = ProductAttributeService.combine_product_text(
  2112. title=product_data.get('title'),
  2113. short_desc=product_data.get('short_desc'),
  2114. long_desc=product_data.get('long_desc'),
  2115. ocr_text=ocr_text
  2116. )
  2117. # Extract attributes from combined text with enhanced features
  2118. result = ProductAttributeService.extract_attributes(
  2119. product_text=product_text,
  2120. mandatory_attrs=mandatory_attrs,
  2121. source_map=source_map,
  2122. model=model,
  2123. extract_additional=extract_additional,
  2124. multiple=multiple,
  2125. threshold_abs=threshold_abs,
  2126. margin=margin,
  2127. use_dynamic_thresholds=use_dynamic_thresholds,
  2128. use_adaptive_margin=use_adaptive_margin,
  2129. use_semantic_clustering=use_semantic_clustering
  2130. )
  2131. result['product_id'] = product_id
  2132. # Add OCR results if available (already in correct format)
  2133. if ocr_results:
  2134. result['ocr_results'] = ocr_results
  2135. # Check if extraction was successful
  2136. if 'error' not in result:
  2137. return result, True
  2138. else:
  2139. return result, False
  2140. except Exception as e:
  2141. return {
  2142. 'product_id': product_id,
  2143. 'mandatory': {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
  2144. 'additional': {} if extract_additional else None,
  2145. 'error': f"Processing error: {str(e)}"
  2146. }, False
  2147. # Process products in parallel
  2148. with ThreadPoolExecutor(max_workers=max_workers) as executor:
  2149. future_to_product = {
  2150. executor.submit(process_product, product): product
  2151. for product in products
  2152. }
  2153. for future in as_completed(future_to_product):
  2154. try:
  2155. result, success = future.result()
  2156. results.append(result)
  2157. if success:
  2158. successful += 1
  2159. else:
  2160. failed += 1
  2161. except Exception as e:
  2162. failed += 1
  2163. results.append({
  2164. 'product_id': 'unknown',
  2165. 'mandatory': {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
  2166. 'additional': {} if extract_additional else None,
  2167. 'error': f"Unexpected error: {str(e)}"
  2168. })
  2169. return {
  2170. 'results': results,
  2171. 'total_products': len(products),
  2172. 'successful': successful,
  2173. 'failed': failed
  2174. }
  2175. @staticmethod
  2176. def _clean_json_response(text: str) -> str:
  2177. """Clean LLM response to extract valid JSON."""
  2178. start_idx = text.find('{')
  2179. end_idx = text.rfind('}')
  2180. if start_idx != -1 and end_idx != -1:
  2181. text = text[start_idx:end_idx + 1]
  2182. if "```json" in text:
  2183. text = text.split("```json")[1].split("```")[0].strip()
  2184. elif "```" in text:
  2185. text = text.split("```")[1].split("```")[0].strip()
  2186. if text.startswith("json"):
  2187. text = text[4:].strip()
  2188. return text
  2189. @staticmethod
  2190. def _validate_response_structure(
  2191. parsed: dict,
  2192. mandatory_attrs: Dict[str, List[str]],
  2193. extract_additional: bool,
  2194. source_map: Dict[str, str] = None
  2195. ) -> dict:
  2196. """Validate and fix the response structure, ensuring array format with source tracking."""
  2197. if source_map is None:
  2198. source_map = {}
  2199. expected_sections = ["mandatory"]
  2200. if extract_additional:
  2201. expected_sections.append("additional")
  2202. if not all(section in parsed for section in expected_sections):
  2203. if isinstance(parsed, dict):
  2204. mandatory_keys = set(mandatory_attrs.keys())
  2205. mandatory = {k: v for k, v in parsed.items() if k in mandatory_keys}
  2206. additional = {k: v for k, v in parsed.items() if k not in mandatory_keys}
  2207. result = {"mandatory": mandatory}
  2208. if extract_additional:
  2209. result["additional"] = additional
  2210. parsed = result
  2211. else:
  2212. return ProductAttributeService._create_error_response(
  2213. "Invalid response structure",
  2214. mandatory_attrs,
  2215. extract_additional,
  2216. str(parsed)
  2217. )
  2218. # Convert mandatory attributes to array format with source tracking
  2219. if "mandatory" in parsed:
  2220. converted_mandatory = {}
  2221. for attr, value in parsed["mandatory"].items():
  2222. if isinstance(value, list):
  2223. # Already in array format, ensure each item has source
  2224. converted_mandatory[attr] = []
  2225. for item in value:
  2226. if isinstance(item, dict) and "value" in item:
  2227. # Already has proper structure
  2228. if "source" not in item:
  2229. item["source"] = ProductAttributeService.find_value_source(
  2230. item["value"], source_map
  2231. )
  2232. converted_mandatory[attr].append(item)
  2233. else:
  2234. # Convert string to proper format
  2235. source = ProductAttributeService.find_value_source(str(item), source_map)
  2236. converted_mandatory[attr].append({"value": str(item), "source": source})
  2237. else:
  2238. # Single value - convert to array format
  2239. source = ProductAttributeService.find_value_source(str(value), source_map)
  2240. converted_mandatory[attr] = [{"value": str(value), "source": source}]
  2241. parsed["mandatory"] = converted_mandatory
  2242. return parsed
  2243. @staticmethod
  2244. def _create_error_response(
  2245. error: str,
  2246. mandatory_attrs: Dict[str, List[str]],
  2247. extract_additional: bool,
  2248. raw_output: Optional[str] = None
  2249. ) -> dict:
  2250. """Create a standardized error response in array format."""
  2251. response = {
  2252. "mandatory": {attr: [{"value": "Not Specified", "source": "error"}] for attr in mandatory_attrs.keys()},
  2253. "error": error
  2254. }
  2255. if extract_additional:
  2256. response["additional"] = {}
  2257. if raw_output:
  2258. response["raw_output"] = raw_output
  2259. return response