services.py 39 KB


  1. # ################## EDITING PROMPT OF ABOVE VERSION ONLY #################
  2. # import json
  3. # import hashlib
  4. # import logging
  5. # import time
  6. # from functools import wraps
  7. # from typing import Dict, List, Optional, Tuple
  8. # import requests
  9. # from django.conf import settings
  10. # from .llm_load_balancer import call_llm_with_load_balancer
  11. # from .cache_config import (
  12. # is_caching_enabled,
  13. # ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  14. # ATTRIBUTE_CACHE_MAX_SIZE,
  15. # )
  16. # logger = logging.getLogger(__name__)
  17. # # --------------------------------------------------------------------------- #
  18. # # CACHES
  19. # # --------------------------------------------------------------------------- #
  20. # class SimpleCache:
  21. # _cache = {}
  22. # _max_size = ATTRIBUTE_CACHE_MAX_SIZE
  23. # @classmethod
  24. # def get(cls, key: str) -> Optional[Dict]:
  25. # if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return None
  26. # return cls._cache.get(key)
  27. # @classmethod
  28. # def set(cls, key: str, value: Dict):
  29. # if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return
  30. # if len(cls._cache) >= cls._max_size:
  31. # items = list(cls._cache.items())
  32. # cls._cache = dict(items[int(cls._max_size * 0.2):])
  33. # cls._cache[key] = value
  34. # @classmethod
  35. # def clear(cls): cls._cache.clear()
  36. # @classmethod
  37. # def get_stats(cls) -> Dict:
  38. # return {
  39. # "enabled": ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  40. # "size": len(cls._cache),
  41. # "max_size": cls._max_size,
  42. # "usage_percent": round(len(cls._cache)/cls._max_size*100, 2) if cls._max_size else 0
  43. # }
  44. # # --------------------------------------------------------------------------- #
  45. # # RETRY DECORATOR
  46. # # --------------------------------------------------------------------------- #
  47. # def retry(max_attempts=3, delay=1.0):
  48. # def decorator(f):
  49. # @wraps(f)
  50. # def wrapper(*args, **kwargs):
  51. # last_exc = None
  52. # for i in range(max_attempts):
  53. # try:
  54. # return f(*args, **kwargs)
  55. # except Exception as e:
  56. # last_exc = e
  57. # if i < max_attempts - 1:
  58. # wait = delay * (2 ** i)
  59. # logger.warning(f"Retry {i+1}/{max_attempts} after {wait}s: {e}")
  60. # time.sleep(wait)
  61. # raise last_exc or RuntimeError("Retry failed")
  62. # return wrapper
  63. # return decorator
  64. # # --------------------------------------------------------------------------- #
  65. # # MAIN SERVICE
  66. # # --------------------------------------------------------------------------- #
  67. # class ProductAttributeService:
  68. # @staticmethod
  69. # def combine_product_text(title=None, short_desc=None, long_desc=None, ocr_text=None) -> Tuple[str, Dict[str, str]]:
  70. # parts = []
  71. # source_map = {}
  72. # if title:
  73. # t = str(title).strip()
  74. # parts.append(f"Title: {t}")
  75. # source_map["title"] = t
  76. # if short_desc:
  77. # s = str(short_desc).strip()
  78. # parts.append(f"Description: {s}")
  79. # source_map["short_desc"] = s
  80. # if long_desc:
  81. # l = str(long_desc).strip()
  82. # parts.append(f"Details: {l}")
  83. # source_map["long_desc"] = l
  84. # if ocr_text:
  85. # parts.append(f"OCR Text: {ocr_text}")
  86. # source_map["ocr_text"] = ocr_text
  87. # combined = "\n".join(parts).strip()
  88. # return (combined or "No product information", source_map)
  89. # @staticmethod
  90. # def _cache_key(product_text: str, mandatory_attrs: Dict, extract_additional: bool, multiple: List[str], user_values: Dict = None) -> str:
  91. # payload = {
  92. # "text": product_text,
  93. # "attrs": mandatory_attrs,
  94. # "extra": extract_additional,
  95. # "multiple": sorted(multiple),
  96. # "user_values": user_values or {}
  97. # }
  98. # return f"attr_{hashlib.md5(json.dumps(payload, sort_keys=True).encode()).hexdigest()}"
  99. # # @staticmethod
  100. # # def _clean_json(text: str) -> str:
  101. # # start = text.find("{")
  102. # # end = text.rfind("}") + 1
  103. # # if start != -1 and end > start:
  104. # # text = text[start:end]
  105. # # if "```json" in text:
  106. # # text = text.split("```json", 1)[1].split("```", 1)[0]
  107. # # elif "```" in text:
  108. # # text = text.split("```", 1)[1].split("```", 1)[0]
  109. # # if text.lstrip().startswith("json"): text = text[4:]
  110. # # return text.strip()
  111. # @staticmethod
  112. # def _clean_json(text: str) -> str:
  113. # text = text.strip()
  114. # # Extract JSON block if wrapped in ```json or ```
  115. # if "```json" in text:
  116. # text = text.split("```json", 1)[1].split("```", 1)[0]
  117. # elif "```" in text:
  118. # parts = text.split("```", 2)
  119. # if len(parts) > 1:
  120. # text = parts[1]
  121. # # Find first { and last }
  122. # start = text.find("{")
  123. # end = text.rfind("}") + 1
  124. # if start == -1 or end <= start:
  125. # raise ValueError("No JSON object found in LLM response")
  126. # text = text[start:end]
  127. # return text.strip()
  128. # @staticmethod
  129. # def format_visual_attributes(visual_attributes: Dict) -> Dict:
  130. # formatted = {}
  131. # for key, value in visual_attributes.items():
  132. # if isinstance(value, list):
  133. # formatted[key] = [{"value": str(item), "source": "image"} for item in value]
  134. # elif isinstance(value, dict):
  135. # nested = {}
  136. # for sub_key, sub_val in value.items():
  137. # if isinstance(sub_val, list):
  138. # nested[sub_key] = [{"value": str(v), "source": "image"} for v in sub_val]
  139. # else:
  140. # nested[sub_key] = [{"value": str(sub_val), "source": "image"}]
  141. # formatted[key] = nested
  142. # else:
  143. # formatted[key] = [{"value": str(value), "source": "image"}]
  144. # return formatted
  145. # # @staticmethod
  146. # # @retry(max_attempts=3, delay=1.0)
  147. # # def _call_llm(payload: dict) -> str:
  148. # # headers = {"Authorization": f"Bearer {settings.GROQ_API_KEY}", "Content-Type": "application/json"}
  149. # # resp = requests.post(settings.GROQ_API_URL, headers=headers, json=payload, timeout=30)
  150. # # resp.raise_for_status()
  151. # # return resp.json()["choices"][0]["message"]["content"]
  152. # # At the top of services.py, add this import
  153. # # from . import call_llm_with_load_balancer, get_load_balancer_stats
  154. # # Replace the existing _call_llm method with this:
  155. # @staticmethod
  156. # @retry(max_attempts=3, delay=3.0)
  157. # def _call_llm(payload: dict) -> str:
  158. # """
  159. # Call LLM using load balancer with multiple API keys
  160. # Automatically handles rate limiting and failover
  161. # """
  162. # return call_llm_with_load_balancer(payload)
  163. # @staticmethod
  164. # def extract_attributes(
  165. # product_text: str,
  166. # mandatory_attrs: Dict[str, List[str]],
  167. # source_map: Dict[str, str] = None,
  168. # model: str = None,
  169. # extract_additional: bool = True,
  170. # multiple: Optional[List[str]] = None,
  171. # use_cache: Optional[bool] = None,
  172. # user_entered_values: Optional[Dict[str, str]] = None, # NEW PARAMETER
  173. # ) -> dict:
  174. # if model is None: model = settings.SUPPORTED_MODELS[0]
  175. # if multiple is None: multiple = []
  176. # if source_map is None: source_map = {}
  177. # if user_entered_values is None: user_entered_values = {}
  178. # if use_cache is None: use_cache = ENABLE_ATTRIBUTE_EXTRACTION_CACHE
  179. # if not is_caching_enabled(): use_cache = False
  180. # cache_key = None
  181. # if use_cache:
  182. # cache_key = ProductAttributeService._cache_key(
  183. # product_text, mandatory_attrs, extract_additional, multiple, user_entered_values
  184. # )
  185. # cached = SimpleCache.get(cache_key)
  186. # if cached:
  187. # logger.info(f"CACHE HIT {cache_key[:16]}...")
  188. # return cached
  189. # # --------------------------- BUILD USER VALUES SECTION ---------------------------
  190. # user_values_section = ""
  191. # if user_entered_values:
  192. # user_lines = []
  193. # for attr, value in user_entered_values.items():
  194. # user_lines.append(f" - {attr}: {value}")
  195. # user_values_section = f"""
  196. # USER MANUALLY ENTERED VALUES:
  197. # {chr(10).join(user_lines)}
  198. # IMPORTANT INSTRUCTIONS FOR USER VALUES:
  199. # 1. Choose the BEST value (could be user's value, or from allowed list, or inferred)
  200. # 2. Always provide a "reason" field explaining your decision. Your reason should be valid and from the product text. Not always exact word to be matched from the product text, you can infer understanding the product text.
  201. # 3. DO NOT hallucinate - be honest if user's value seems wrong based on product evidence
  202. # 4. If user's value is not in the allowed list but seems correct, chose the most nearest value from the allowed list with proper reasoning why it chose it. Also in this case give the most apt value that should be added in the possible list in the reason so that user can edit it later.
  203. # """
  204. # # --------------------------- PROMPT ---------------------------
  205. # allowed_lines = [f"{attr}: {', '.join(vals)}" for attr, vals in mandatory_attrs.items()]
  206. # allowed_text = "\n".join(allowed_lines)
  207. # print(f"allowed text is {allowed_text}")
  208. # allowed_sources = list(source_map.keys()) + ["title", "description", "inferred"]
  209. # print(f"allowed_sources is {allowed_sources}")
  210. # source_hint = "|".join(allowed_sources)
  211. # multiple_text = f"\nMULTIPLE ALLOWED FOR: {', '.join(multiple)}" if multiple else ""
  212. # if extract_additional:
  213. # additional_instructions = """
  214. # For the 'additional' section, identify any other important product attributes and their values (e.g., 'Color', 'Material', 'Weight' etc according to the product text) that are present in the PRODUCT TEXT but not in the Mandatory Attribute list.
  215. # For each additional attribute, use the best available value from the PRODUCT TEXT and specify the 'source'.
  216. # Strictly Extract other key attributes other than mandatory attributes from the text.
  217. # """
  218. # output_example_additional = """
  219. # "additional": {
  220. # "Additional_Attr_1": [{
  221. # "value": "Value 1",
  222. # "source": "<{source_hint}>",
  223. # "reason": "Why this attribute and value were identified"
  224. # }]
  225. # }
  226. # """
  227. # else:
  228. # additional_instructions = """
  229. # Do not identify or include any additional attributes. The 'additional' section must be an empty object {}.
  230. # """
  231. # output_example_additional = ' "additional": {}'
  232. # prompt = f"""
  233. # You are an expert product attribute extraction system. Your task is to classify products into predefined categories with 100% accuracy.
  234. # CORE RULES - FOLLOW STRICTLY:
  235. # ***CRITICAL T-SHIRT RULE – ALWAYS APPLY FIRST***
  236. # For ANY generic short-sleeve or long-sleeve T-shirt that is pulled over the head
  237. # (no zipper, no buttons, no special silhouette mentioned) you **MUST** classify
  238. # style_clothing_top as "Pullover". This overrides everything else for basic tees.
  239. # Examples: crew neck tee, v-neck tee, graphic tee, plain tee → ALWAYS Pullover.
  240. # 1. You MUST ONLY use values from the ALLOWED VALUES lists provided below
  241. # 2. NEVER invent, create, or infer values that don't exist in the allowed lists
  242. # 3. NEVER use synonyms, variations, or similar words - ONLY exact matches from the lists
  243. # 4. Each value you return MUST be an EXACT COPY from the allowed list (case-sensitive)
  244. # 5. If you cannot find a perfect match, choose the CLOSEST semantic match from the allowed list
  245. # 6. When unsure, default to the most generic option from the allowed list
  246. # 7. CRITICAL GUARDRAIL: NEVER use a value allowed for one attribute (e.g., T-Shirts from t_shirt_type) as the value for a different attribute (e.g., style_clothing_top). The lists are entirely separate.
  247. # 8. NEVER invent, create, or infer values that don't exist in the allowed lists.
  248. # 9. Special Instruction for T-Shirts: If the product text describes a generic T-shirt (Crew Neck, V-Neck, etc.) which is a basic shirt worn by pulling it over the head, you MUST use the value Pullover. This is the most appropriate generic style available in this specific list.
  249. # =====================
  250. # ATTRIBUTE DEFINITIONS + ALLOWED VALUES
  251. # =====================
  252. # 1️⃣ **style_clothing_top** (Mandatory)
  253. # Defines the overall silhouette or construction of the top.
  254. # Allowed values + definitions:
  255. # - Bandeau: Strapless, tight top covering just bust.
  256. # - Blouse: Loose, dressy top, usually with sleeves.
  257. # - Camisole: Thin-strapped sleeveless lightweight top.
  258. # - Chemise: Straight, loose unshaped top.
  259. # - Cocoon: Rounded, oversized body silhouette.
  260. # - Corset: Structured top shaping waist with boning.
  261. # - Crop: Short top exposing midriff above waist.
  262. # - Cutout: Top with intentionally open fabric areas.
  263. # - Duster: Long open-front flowy layer.
  264. # - Flounce: Top featuring decorative ruffles.
  265. # - Full Zip: Zipper opening entire length front.
  266. # - Guide: Top with reference markers or functional guides.
  267. # - Half Zip: Zipper halfway down front.
  268. # - High-Low: Back hem longer than front.
  269. # - Hoodie: Top with a hood attached.
  270. # - Muscle: Sleeveless with wide armholes, athletic.
  271. # - Peasant: Loose boho style with gathered neckline/sleeves.
  272. # - Peplum: Fitted top with flared waist ruffle.
  273. # - Pullover: Worn by pulling over head; no front opening.
  274. # - Quarter Zip: Short zipper from collar ~¼ length down.
  275. # - Raglan: Sleeves extend to collar with diagonal seams.
  276. # - Ringer: Contrast colored sleeve cuffs + neckline band.
  277. # - Rugby: Thick striped or solid collared sport-style top.
  278. # - Smocked: Gathered elastic shirring for stretch texture.
  279. # - Swing: A-line flare from bust downward.
  280. # - Torsette: Corset-like, bust exposed for layering.
  281. # - Tube: Strapless elongated top; longer than bandeau.
  282. # - Zip-Up: Top with zipper closure (partial/full).
  283. # 2️⃣ **shirt_neck_style** (Mandatory)
  284. # Describes the neckline’s shape and construction.
  285. # Allowed values + definitions:
  286. # - Boat Neck: Wide neckline shoulder-to-shoulder.
  287. # - Caged Neck: Multiple straps forming cage-like design.
  288. # - Choker Neck: Tight high neck like choker band.
  289. # - Collared: Fold-over collar shirt/polo style.
  290. # - Cowl Neck: Draped soft neckline folds.
  291. # - Crew Neck: Close-fitting round classic neckline.
  292. # - Deep V-Neck: Deep V shape below chest level.
  293. # - Drape Neck: Softly draped neckline less than cowl.
  294. # - Funnel Neck: Short stand-up collar not folded.
  295. # - Halter: Straps around neck leaving shoulders bare.
  296. # - Henley: Round neck with button placket.
  297. # - High Neck: More neck coverage, no fold.
  298. # - Hooded: Neck includes a hood.
  299. # - Jewel Neck: High round neck at base of throat.
  300. # - Keyhole Neck: Slit/hole opening at neckline.
  301. # - Lace Neckline: Lace material used around neckline.
  302. # - Mock Neck: Short raised collar, not folded.
  303. # - Notch Neck: Small V cut in round neckline.
  304. # - Open: General wide/open neckline shape.
  305. # - Plunge: Very deep V/U revealing cleavage.
  306. # - Roll Neck: Loose rolled turtleneck style.
  307. # - Round Neck: Standard circular neckline.
  308. # - Round Neckline: Same as Round Neck.
  309. # - Scoop Neck: Wide deep U-shaped neckline.
  310. # - Scrunch Neck: Gathered/scrunched fabric at neckline.
  311. # - Slit Neck: Small vertical slit opening at front.
  312. # - Square Neckline: Straight edged square neck.
  313. # - Sweetheart: Heart-shaped neckline contour.
  314. # - Tie Neck: Ties or bow at neckline.
  315. # - Turtleneck: High folded collar covering full neck.
  316. # - V-Neck: V-shaped neckline (not deep).
  317. # - Wide Neck: Broad neckline toward shoulders.
  318. # - Zip Mock Neck: Mock neck with zipper.
  319. # 3️⃣ **t_shirt_type** (Mandatory)
  320. # Defines the category/design purpose of the T-shirt.
  321. # Allowed values + definitions:
  322. # - Babydoll T-Shirt: Feminine fitted shirt with flared hem.
  323. # - Classic T-Shirt: Standard basic crew tee.
  324. # - Graphic Tees: T-shirts featuring printed graphics.
  325. # - Pocket Tee: T-shirt having a chest pocket.
  326. # - T-Shirts: General type when no specific style is clear.
  327. # - Tank Tops: Sleeveless shirts with shoulder straps.
  328. # =====================
  329. # MANDATORY ATTRIBUTE RULES
  330. # =====================
  331. # MULTI-VALUE ATTRIBUTES (Return exactly 2 most relevant values ONLY):
  332. # {multiple_text}
  333. # SINGLE-VALUE ATTRIBUTES (Return exactly 1 value for all other attributes)
  334. # =====================
  335. # INPUT PRODUCT DETAILS
  336. # =====================
  337. # PRODUCT INFORMATION TO ANALYZE:
  338. # {product_text}
  339. # {additional_instructions}
  340. # =====================
  341. # EXTRACTION STRATEGY
  342. # =====================
  343. # Step 1: Read product text carefully
  344. # Step 2: Identify strong style/type/neck indicators
  345. # Step 3: Choose closest exact match from allowed list
  346. # Step 4: VALIDATE chosen values exist in allowed list
  347. # =====================
  348. # FORBIDDEN ACTIONS
  349. # =====================
  350. # :x: NEVER invent values
  351. # :x: NEVER modify spelling/capitalization
  352. # :x: NEVER use non-allowed synonyms
  353. # :x: NEVER return attributes outside allowed lists
  354. # :x: NEVER use descriptive words like “short sleeve”, “women top”, “graphic print” unless EXACT in list
  355. # =====================
  356. # OUTPUT FORMAT
  357. # =====================
  358. # Return ONLY this JSON structure — no markdown, no explanation:
  359. # {{
  360. # "mandatory": {{
  361. # "style_clothing_top": [{{
  362. # "value": "<exact value from allowed list>",
  363. # "source": "{source_hint}",
  364. # "reason": "Explain specific mapping from product text"
  365. # }}],
  366. # "shirt_neck_style": [{{
  367. # "value": "<exact value from allowed list>",
  368. # "source": "{source_hint}",
  369. # "reason": "Explain specific mapping from product text"
  370. # }}],
  371. # "t_shirt_type": [{{
  372. # "value": "<exact value from allowed list>",
  373. # "source": "{source_hint}",
  374. # "reason": "Explain specific mapping from product text"
  375. # }}]
  376. # }},
  377. # {output_example_additional}
  378. # }}
  379. # =====================
  380. # FINAL VALIDATION BEFORE RESPONDING
  381. # =====================
  382. # ✓ All values EXACT from allowed list
  383. # ✓ Multi-value attributes have exactly 2 values
  384. # ✓ Single-value attributes have exactly 1 value
  385. # ✓ “source” must be one of: {source_hint}
  386. # ✓ Reasons clearly explain mapping
  387. # ✓ Pure JSON — no markdown wrapper
  388. # """
  389. # print(f"Prompt to the llm is: {prompt}")
  390. # payload = {
  391. # "model": model,
  392. # "messages": [
  393. # {"role": "system", "content": "You are a JSON-only extractor and validator. Always provide clear reasoning for your decisions."},
  394. # {"role": "user", "content": prompt},
  395. # ],
  396. # "temperature": 0.0,
  397. # "max_tokens": 2000, # Increased for reasoning
  398. # }
  399. # try:
  400. # raw = ProductAttributeService._call_llm(payload)
  401. # logger.info("Raw LLM response received")
  402. # print(raw)
  403. # cleaned = ProductAttributeService._clean_json(raw)
  404. # parsed = json.loads(cleaned)
  405. # except Exception as exc:
  406. # logger.error(f"LLM failed: {exc}")
  407. # return {
  408. # "mandatory": {
  409. # a: [{
  410. # "value": "Not Specified",
  411. # "source": "llm_error",
  412. # "reason": f"LLM processing failed: {str(exc)}"
  413. # }] for a in mandatory_attrs
  414. # },
  415. # "additional": {} if not extract_additional else {},
  416. # "error": str(exc)
  417. # }
  418. # if use_cache and cache_key:
  419. # SimpleCache.set(cache_key, parsed)
  420. # logger.info(f"CACHE SET {cache_key[:16]}...")
  421. # return parsed
  422. # @staticmethod
  423. # def get_cache_stats() -> Dict:
  424. # return {
  425. # "global_enabled": is_caching_enabled(),
  426. # "result_cache": SimpleCache.get_stats(),
  427. # }
  428. # @staticmethod
  429. # def clear_all_caches():
  430. # SimpleCache.clear()
  431. # logger.info("All caches cleared")
  432. # # IMPORTANT INSTRUCTIONS FOR USER VALUES:
  433. # # 1. Compare the user-entered value with what you find in the product text
  434. # # 2. Evaluate if the user value is correct, partially correct, or incorrect for this product
  435. # # 3. Choose the BEST value (could be user's value, or from allowed list, or inferred)
  436. # # 4. Always provide a "reason" field explaining your decision
  437. # # 5. DO NOT hallucinate - be honest if user's value seems wrong based on product evidence
  438. # # 6. If user's value is not in the allowed list but seems correct, chose the most nearest value from the allowed list with proper reasoning.
  439. import json
  440. import hashlib
  441. import logging
  442. import time
  443. import difflib
  444. import re
  445. from functools import wraps
  446. from typing import Dict, List, Optional, Tuple
  447. import requests
  448. from django.conf import settings
  449. from .llm_load_balancer import call_llm_with_load_balancer
  450. from .cache_config import (
  451. is_caching_enabled,
  452. ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  453. ATTRIBUTE_CACHE_MAX_SIZE,
  454. )
  455. logger = logging.getLogger(__name__)
  456. # --------------------------------------------------------------------------- #
  457. # CACHES
  458. # --------------------------------------------------------------------------- #
  459. class SimpleCache:
  460. _cache = {}
  461. _max_size = ATTRIBUTE_CACHE_MAX_SIZE
  462. @classmethod
  463. def get(cls, key: str) -> Optional[Dict]:
  464. if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return None
  465. return cls._cache.get(key)
  466. @classmethod
  467. def set(cls, key: str, value: Dict):
  468. if not ENABLE_ATTRIBUTE_EXTRACTION_CACHE: return
  469. if len(cls._cache) >= cls._max_size:
  470. items = list(cls._cache.items())
  471. cls._cache = dict(items[int(cls._max_size * 0.2):])
  472. cls._cache[key] = value
  473. @classmethod
  474. def clear(cls): cls._cache.clear()
  475. @classmethod
  476. def get_stats(cls) -> Dict:
  477. return {
  478. "enabled": ENABLE_ATTRIBUTE_EXTRACTION_CACHE,
  479. "size": len(cls._cache),
  480. "max_size": cls._max_size,
  481. "usage_percent": round(len(cls._cache)/cls._max_size*100, 2) if cls._max_size else 0
  482. }
  483. # --------------------------------------------------------------------------- #
  484. # RETRY DECORATOR
  485. # --------------------------------------------------------------------------- #
  486. def retry(max_attempts=3, delay=0.5):
  487. def decorator(f):
  488. @wraps(f)
  489. def wrapper(*args, **kwargs):
  490. last_exc = None
  491. for i in range(max_attempts):
  492. try:
  493. return f(*args, **kwargs)
  494. except Exception as e:
  495. last_exc = e
  496. if i < max_attempts - 1:
  497. wait = delay * (2 ** i)
  498. logger.warning(f"Retry {i+1}/{max_attempts} after {wait}s: {e}")
  499. time.sleep(wait)
  500. raise last_exc or RuntimeError("Retry failed")
  501. return wrapper
  502. return decorator
  503. # --------------------------------------------------------------------------- #
  504. # MAIN SERVICE
  505. # --------------------------------------------------------------------------- #
  506. class ProductAttributeService:
  507. @staticmethod
  508. def combine_product_text(title=None, short_desc=None, long_desc=None, ocr_text=None) -> Tuple[str, Dict[str, str]]:
  509. parts = []
  510. source_map = {}
  511. if title:
  512. t = str(title).strip()
  513. parts.append(f"Title: {t}")
  514. source_map["title"] = t
  515. if short_desc:
  516. s = str(short_desc).strip()
  517. parts.append(f"Description: {s}")
  518. source_map["short_desc"] = s
  519. if long_desc:
  520. l = str(long_desc).strip()
  521. parts.append(f"Details: {l}")
  522. source_map["long_desc"] = l
  523. if ocr_text:
  524. parts.append(f"OCR Text: {ocr_text}")
  525. source_map["ocr_text"] = ocr_text
  526. combined = "\n".join(parts).strip()
  527. return (combined or "No product information", source_map)
  528. @staticmethod
  529. def _cache_key(product_text: str, mandatory_attrs: Dict, extract_additional: bool, multiple: List[str], user_values: Dict = None) -> str:
  530. payload = {
  531. "text": product_text,
  532. "attrs": mandatory_attrs,
  533. "extra": extract_additional,
  534. "multiple": sorted(multiple),
  535. "user_values": user_values or {}
  536. }
  537. return f"attr_{hashlib.md5(json.dumps(payload, sort_keys=True).encode()).hexdigest()}"
  538. @staticmethod
  539. def _clean_json(text: str) -> str:
  540. text = text.strip()
  541. if "```json" in text:
  542. text = text.split("```json", 1)[1].split("```", 1)[0]
  543. elif "```" in text:
  544. parts = text.split("```", 2)
  545. if len(parts) > 1:
  546. text = parts[1]
  547. start = text.find("{")
  548. end = text.rfind("}") + 1
  549. if start == -1 or end <= start:
  550. raise ValueError("No JSON object found in LLM response")
  551. text = text[start:end]
  552. return text.strip()
  553. @staticmethod
  554. def _find_best_match(value: str, allowed_list: List[str]) -> Optional[str]:
  555. if not value or not allowed_list:
  556. return None
  557. value_lower = value.lower()
  558. # 1. Exact match
  559. for allowed in allowed_list:
  560. if allowed.lower() == value_lower:
  561. return allowed
  562. # 2. Substring match
  563. if len(value_lower) > 3:
  564. for allowed in allowed_list:
  565. if value_lower in allowed.lower() or allowed.lower() in value_lower:
  566. return allowed
  567. # 3. Fuzzy match
  568. matches = difflib.get_close_matches(value, allowed_list, n=1, cutoff=0.5)
  569. if matches:
  570. return matches[0]
  571. return None
  572. @staticmethod
  573. def format_visual_attributes(visual_attributes: Dict) -> Dict:
  574. formatted = {}
  575. for key, value in visual_attributes.items():
  576. if isinstance(value, list):
  577. formatted[key] = [{"value": str(item), "source": "image"} for item in value]
  578. elif isinstance(value, dict):
  579. nested = {}
  580. for sub_key, sub_val in value.items():
  581. if isinstance(sub_val, list):
  582. nested[sub_key] = [{"value": str(v), "source": "image"} for v in sub_val]
  583. else:
  584. nested[sub_key] = [{"value": str(sub_val), "source": "image"}]
  585. formatted[key] = nested
  586. else:
  587. formatted[key] = [{"value": str(value), "source": "image"}]
  588. return formatted
  589. @staticmethod
  590. @retry(max_attempts=3, delay=0.5)
  591. def _call_llm(payload: dict) -> str:
  592. return call_llm_with_load_balancer(payload)
  593. @staticmethod
  594. def extract_attributes(
  595. product_text: str,
  596. mandatory_attrs: Dict[str, List[str]],
  597. source_map: Dict[str, str] = None,
  598. model: str = None,
  599. extract_additional: bool = True,
  600. multiple: Optional[List[str]] = None,
  601. use_cache: Optional[bool] = None,
  602. user_entered_values: Optional[Dict[str, str]] = None,
  603. ) -> dict:
  604. if model is None: model = settings.SUPPORTED_MODELS[0]
  605. if multiple is None: multiple = []
  606. if source_map is None: source_map = {}
  607. if user_entered_values is None: user_entered_values = {}
  608. if use_cache is None: use_cache = ENABLE_ATTRIBUTE_EXTRACTION_CACHE
  609. if not is_caching_enabled(): use_cache = False
  610. cache_key = None
  611. if use_cache:
  612. cache_key = ProductAttributeService._cache_key(
  613. product_text, mandatory_attrs, extract_additional, multiple, user_entered_values
  614. )
  615. cached = SimpleCache.get(cache_key)
  616. if cached:
  617. logger.info(f"CACHE HIT {cache_key[:16]}...")
  618. return cached
  619. # --------------------------- PREPARE DATA ---------------------------
  620. allowed_lines = [f"{attr}: {', '.join(vals)}" for attr, vals in mandatory_attrs.items()]
  621. allowed_text_dynamic = "\n".join(allowed_lines)
  622. requested_keys = list(mandatory_attrs.keys())
  623. product_text_lower = product_text.lower()
  624. allowed_sources = list(source_map.keys()) + ["title", "description", "inferred"]
  625. source_hint = "|".join(allowed_sources)
  626. multiple_text = f"\nMULTIPLE ALLOWED FOR: {', '.join(multiple)}" if multiple else ""
  627. user_values_section = ""
  628. if user_entered_values:
  629. user_lines = [f" - {attr}: {value}" for attr, value in user_entered_values.items()]
  630. user_values_section = f"USER MANUALLY ENTERED VALUES:\n{chr(10).join(user_lines)}\nIMPORTANT: Validate user values against product text. If correct, select them."
  631. if extract_additional:
  632. additional_instructions = "For the 'additional' section, identify other key attributes."
  633. output_example_additional = """ "additional": { "Attribute_Name": [{"value": "Extracted Value", "source": "<source>", "reason": "Brief reason"}] }"""
  634. else:
  635. additional_instructions = "The 'additional' section must be an empty object {}."
  636. output_example_additional = ' "additional": {}'
  637. # --------------------------- PROMPT SELECTION ---------------------------
  638. keys_set = set(requested_keys)
  639. # 1. TABLE LAMPS
  640. if any(k in keys_set for k in ["lamp_type", "switch_type", "power_source", "Recommended_Room", "Age_Group"]):
  641. logger.info("Selecting TABLE LAMP prompt.")
  642. definitions_block = """
  643. === TABLE LAMP ATTRIBUTE DEFINITIONS ===
  644. 1. **lamp_type**: 'Stick', 'Table', 'Novelty', 'Desk/Task', 'Tiffany'.
  645. 2. **switch_type**: 'Pull Chain', 'Rocker', 'Rotary Socket', 'Touch', 'Push Button'.
  646. 3. **power_source**: 'Corded Electric', 'Battery Powered', 'USB'.
  647. 4. **Age_Group**: 'Child' (Kids/Nursery), 'Adult', 'Teen'.
  648. 5. **Recommended_Room**: 'Kids Room' (if child/themed), 'Bedroom', 'Office'.
  649. """
  650. prompt = f"""
  651. You are an expert Home Decor Attribute Extractor.
  652. {user_values_section}
  653. {definitions_block}
  654. === INSTRUCTIONS ===
  655. 1. Extract EXACT values from Allowed List.
  656. 2. **Mapping**:
  657. - "Kids/Nursery" -> Age_Group='Child', Room='Kids Room'.
  658. - "Plug-in/Cord" -> power_source='Corded Electric'.
  659. 3. **Brevity**: Keep "reason" short.
  660. === REQUIRED ATTRIBUTES & ALLOWED VALUES ===
  661. {allowed_text_dynamic}
  662. {multiple_text}
  663. === INPUT DATA ===
  664. {product_text}
  665. {additional_instructions}
  666. === OUTPUT FORMAT ===
  667. Return pure JSON:
  668. {{
  669. "mandatory": {{
  670. "attribute_name": [{{
  671. "value": "<exact value from allowed list>",
  672. "source": "{source_hint}",
  673. "reason": "Brief evidence"
  674. }}]
  675. }},
  676. {output_example_additional}
  677. }}
  678. """
  679. # 2. T-SHIRTS / CLOTHING
  680. elif any(k in keys_set for k in ["style_clothing_top", "shirt_neck_style", "t_shirt_type"]):
  681. logger.info("Selecting T-SHIRT prompt.")
  682. definitions_block = """
  683. === CLOTHING ATTRIBUTE HIERARCHY ===
  684. 1. **style_clothing_top**:
  685. - **Specifics**: 'Blouse', 'Camisole', 'Peplum', 'Crop', 'Tank', 'Tube', 'Tunic'.
  686. - **Generic**: **'Pullover'** (Standard T-shirts, no zipper/buttons).
  687. 2. **shirt_neck_style**:
  688. - 'Round Neck' (Standard), 'Crew Neck', 'V-Neck'.
  689. 3. **t_shirt_type**:
  690. - **Graphic Tees** (Has print/image).
  691. - **Classic T-Shirt** (Solid/Simple).
  692. - **Pocket Tee** (Has pocket).
  693. """
  694. prompt = f"""
  695. You are an expert Fashion Attribute Extractor.
  696. {user_values_section}
  697. {definitions_block}
  698. === INSTRUCTIONS ===
  699. 1. Extract attributes based on definitions.
  700. 2. **Hierarchy**: Check for Specific Styles ('Blouse', 'Peplum') first.
  701. 3. **Defaulting**: If it's a standard Tee/Top with no specific style -> Select **'Pullover'**.
  702. 4. **Anti-Hallucination**: Do NOT select 'Peasant' or 'Loose' unless explicitly stated. 'Graphic' is NOT a style.
  703. === REQUIRED ATTRIBUTES & ALLOWED VALUES ===
  704. {allowed_text_dynamic}
  705. {multiple_text}
  706. === INPUT DATA ===
  707. {product_text}
  708. {additional_instructions}
  709. === OUTPUT FORMAT ===
  710. Return pure JSON:
  711. {{
  712. "mandatory": {{
  713. "attribute_name": [{{
  714. "value": "<exact value from allowed list>",
  715. "source": "{source_hint}",
  716. "reason": "Brief evidence"
  717. }}]
  718. }},
  719. {output_example_additional}
  720. }}
  721. """
  722. else:
  723. logger.info("Selecting GENERIC prompt.")
  724. prompt = f"""
  725. Extract attributes.
  726. {user_values_section}
  727. === REQUIRED ATTRIBUTES & ALLOWED VALUES ===
  728. {allowed_text_dynamic}
  729. {multiple_text}
  730. === INPUT DATA ===
  731. {product_text}
  732. {additional_instructions}
  733. === OUTPUT FORMAT ===
  734. Return pure JSON:
  735. {{
  736. "mandatory": {{
  737. "attribute_name": [{{
  738. "value": "<exact value from allowed list>",
  739. "source": "{source_hint}",
  740. "reason": "Brief reasoning"
  741. }}]
  742. }},
  743. {output_example_additional}
  744. }}
  745. """
  746. payload = {
  747. "model": model,
  748. "messages": [
  749. {"role": "system", "content": "You are a strict JSON extractor."},
  750. {"role": "user", "content": prompt},
  751. ],
  752. "temperature": 0.0,
  753. "max_tokens": 800,
  754. }
  755. try:
  756. raw = ProductAttributeService._call_llm(payload)
  757. cleaned = ProductAttributeService._clean_json(raw)
  758. parsed = json.loads(cleaned)
  759. # --------------------------- VALIDATION & RECOVERY ---------------------------
  760. if "mandatory" in parsed and isinstance(parsed["mandatory"], dict):
  761. filtered_mandatory = {}
  762. for key, items in parsed["mandatory"].items():
  763. if key not in mandatory_attrs: continue
  764. valid_options = mandatory_attrs[key]
  765. validated_items = []
  766. for item in items:
  767. raw_val = item.get("value", "").strip()
  768. # 1. Try Match
  769. best_match = ProductAttributeService._find_best_match(raw_val, valid_options)
  770. # 2. Guardrails
  771. if best_match in ["Peasant", "Chemise", "Corset", "Bandeau"]:
  772. if best_match.lower() not in product_text_lower:
  773. best_match = None
  774. if best_match:
  775. item["value"] = best_match
  776. validated_items.append(item)
  777. else:
  778. # 3. Fallback Text Search
  779. found_in_text = None
  780. for opt in valid_options:
  781. if opt.lower() in product_text_lower:
  782. found_in_text = opt
  783. break
  784. if found_in_text:
  785. item["value"] = found_in_text
  786. item["source"] = "inferred_from_text"
  787. item["reason"] = f"Found keyword '{found_in_text}'."
  788. validated_items.append(item)
  789. else:
  790. # 4. LOGIC OVERRIDES (The Fix)
  791. if key == "style_clothing_top" and "Pullover" in valid_options:
  792. if any(x in product_text_lower for x in ["t-shirt", "tee", "top", "shirt"]):
  793. item["value"] = "Pullover"
  794. item["reason"] = "Fallback: Standard Tee implies Pullover."
  795. validated_items.append(item)
  796. elif key == "shirt_neck_style" and "Round Neck" in valid_options:
  797. item["value"] = "Round Neck"
  798. item["reason"] = "Fallback: Defaulting to Round Neck."
  799. validated_items.append(item)
  800. elif key == "t_shirt_type" and "T-Shirts" in valid_options:
  801. item["value"] = "T-Shirts"
  802. item["reason"] = "Fallback: Generic T-Shirt."
  803. validated_items.append(item)
  804. if validated_items:
  805. filtered_mandatory[key] = validated_items
  806. else:
  807. # =========================================================
  808. # FINAL LAST RESORT: FORCED DEFAULT (NO "NOT SPECIFIED")
  809. # =========================================================
  810. default_val = None
  811. # T-Shirt Defaults
  812. if key == "style_clothing_top" and "Pullover" in valid_options:
  813. default_val = "Pullover"
  814. elif key == "shirt_neck_style" and "Round Neck" in valid_options:
  815. default_val = "Round Neck"
  816. elif key == "t_shirt_type" and "T-Shirts" in valid_options:
  817. default_val = "T-Shirts"
  818. elif key == "t_shirt_type" and "Classic T-Shirt" in valid_options:
  819. default_val = "Classic T-Shirt"
  820. # Lamp Defaults
  821. elif key == "power_source" and "Corded Electric" in valid_options:
  822. default_val = "Corded Electric" # Most common
  823. if default_val:
  824. filtered_mandatory[key] = [{
  825. "value": default_val,
  826. "source": "system_default",
  827. "reason": "Forced valid default to avoid Not Specified."
  828. }]
  829. else:
  830. # Truly nothing works
  831. filtered_mandatory[key] = [{
  832. "value": "Not Specified",
  833. "source": "system",
  834. "reason": "No match found and no safe default available."
  835. }]
  836. parsed["mandatory"] = filtered_mandatory
  837. except Exception as exc:
  838. logger.error(f"LLM failed: {exc}")
  839. return {
  840. "mandatory": {
  841. a: [{
  842. "value": "Not Specified",
  843. "source": "llm_error",
  844. "reason": f"LLM processing failed: {str(exc)}"
  845. }] for a in mandatory_attrs
  846. },
  847. "additional": {} if not extract_additional else {},
  848. "error": str(exc)
  849. }
  850. if use_cache and cache_key:
  851. SimpleCache.set(cache_key, parsed)
  852. return parsed
  @staticmethod
  def get_cache_stats() -> Dict:
      """Collect a snapshot of the caching state for this service.

      Returns:
          A dict with two entries, in this order:
          ``"global_enabled"`` - the value returned by ``is_caching_enabled()``.
          ``"result_cache"`` - the value returned by ``SimpleCache.get_stats()``.
      """
      # Query the global switch first, then the result cache, so the calls
      # happen in the same order as the keys appear in the returned dict.
      stats = {"global_enabled": is_caching_enabled()}
      stats["result_cache"] = SimpleCache.get_stats()
      return stats
  @staticmethod
  def clear_all_caches():
      """Clear the result cache and record that it happened.

      Calls ``SimpleCache.clear()`` and then writes an INFO-level log line.
      Returns nothing.
      """
      # Clear first so the log line is only written after the call returns.
      SimpleCache.clear()
      logger.info("All caches cleared")