import functools from functools import lru_cache from curl_cffi import requests from bs4 import BeautifulSoup import datetime from frozendict import frozendict from . import utils, cache import threading from .exceptions import YFRateLimitError cache_maxsize = 64 def lru_cache_freezeargs(func): """ Decorator transforms mutable dictionary and list arguments into immutable types Needed so lru_cache can cache method calls what has dict or list arguments. """ @functools.wraps(func) def wrapped(*args, **kwargs): args = tuple([frozendict(arg) if isinstance(arg, dict) else arg for arg in args]) kwargs = {k: frozendict(v) if isinstance(v, dict) else v for k, v in kwargs.items()} args = tuple([tuple(arg) if isinstance(arg, list) else arg for arg in args]) kwargs = {k: tuple(v) if isinstance(v, list) else v for k, v in kwargs.items()} return func(*args, **kwargs) # copy over the lru_cache extra methods to this wrapper to be able to access them # after this decorator has been applied wrapped.cache_info = func.cache_info wrapped.cache_clear = func.cache_clear return wrapped class SingletonMeta(type): """ Metaclass that creates a Singleton instance. """ _instances = {} _lock = threading.Lock() def __call__(cls, *args, **kwargs): with cls._lock: if cls not in cls._instances: instance = super().__call__(*args, **kwargs) cls._instances[cls] = instance else: # Update the existing instance if 'session' in kwargs or (args and len(args) > 0): session = kwargs.get('session') if 'session' in kwargs else args[0] cls._instances[cls]._set_session(session) if 'proxy' in kwargs or (args and len(args) > 1): proxy = kwargs.get('proxy') if 'proxy' in kwargs else args[1] cls._instances[cls]._set_proxy(proxy) return cls._instances[cls] class YfData(metaclass=SingletonMeta): """ Have one place to retrieve data from Yahoo API in order to ease caching and speed up operations. Singleton means one session one cookie shared by all threads. """ def __init__(self, session=None, proxy=None): self._crumb = None self._cookie = None # Default to using 'basic' strategy self._cookie_strategy = 'basic' # If it fails, then fallback method is 'csrf' # self._cookie_strategy = 'csrf' self._cookie_lock = threading.Lock() self._session, self._proxy = None, None self._set_session(session or requests.Session(impersonate="chrome")) self._set_proxy(proxy) def _set_session(self, session): if session is None: return with self._cookie_lock: self._session = session if self._proxy is not None: self._session.proxies = self._proxy try: self._session.cache except AttributeError: # Not caching self._session_is_caching = False else: # Is caching. This is annoying. # Can't simply use a non-caching session to fetch cookie & crumb, # because then the caching-session won't have cookie. self._session_is_caching = True from requests_cache import DO_NOT_CACHE self._expire_after = DO_NOT_CACHE def _set_proxy(self, proxy=None): with self._cookie_lock: if proxy is not None: proxy = {'http': proxy, 'https': proxy} if isinstance(proxy, str) else proxy else: proxy = {} self._proxy = proxy self._session.proxies = proxy def _set_cookie_strategy(self, strategy, have_lock=False): if strategy == self._cookie_strategy: return if not have_lock: self._cookie_lock.acquire() try: if self._cookie_strategy == 'csrf': utils.get_yf_logger().debug(f'toggling cookie strategy {self._cookie_strategy} -> basic') self._session.cookies.clear() self._cookie_strategy = 'basic' else: utils.get_yf_logger().debug(f'toggling cookie strategy {self._cookie_strategy} -> csrf') self._cookie_strategy = 'csrf' self._cookie = None self._crumb = None except Exception: self._cookie_lock.release() raise if not have_lock: self._cookie_lock.release() def _save_session_cookies(self): try: cache.get_cookie_cache().store('csrf', self._session.cookies) except Exception: return False return True def _load_session_cookies(self): cookie_dict = cache.get_cookie_cache().lookup('csrf') if cookie_dict is None: return False # Periodically refresh, 24 hours seems fair. if cookie_dict['age'] > datetime.timedelta(days=1): return False self._session.cookies.update(cookie_dict['cookie']) utils.get_yf_logger().debug('loaded persistent cookie') def _save_cookie_basic(self, cookie): try: cache.get_cookie_cache().store('basic', cookie) except Exception: return False return True def _load_cookie_basic(self): cookie_dict = cache.get_cookie_cache().lookup('basic') if cookie_dict is None: return None # Periodically refresh, 24 hours seems fair. if cookie_dict['age'] > datetime.timedelta(days=1): return None utils.get_yf_logger().debug('loaded persistent cookie') return cookie_dict['cookie'] def _get_cookie_basic(self, timeout=30): # To avoid infinite recursion, do NOT use self.get() # - 'allow_redirects' copied from @psychoz971 solution - does it help USA? self._session.get( url='https://fc.yahoo.com', timeout=timeout, allow_redirects=True) def _get_crumb_basic(self, timeout=30): if self._crumb is not None: utils.get_yf_logger().debug('reusing crumb') return self._crumb self._get_cookie_basic() # - 'allow_redirects' copied from @psychoz971 solution - does it help USA? get_args = { 'url': "https://query1.finance.yahoo.com/v1/test/getcrumb", 'timeout': timeout, 'allow_redirects': True } if self._session_is_caching: get_args['expire_after'] = self._expire_after crumb_response = self._session.get(**get_args) else: crumb_response = self._session.get(**get_args) self._crumb = crumb_response.text if self._crumb is None or '' in self._crumb: utils.get_yf_logger().debug("Didn't receive crumb") return None utils.get_yf_logger().debug(f"crumb = '{self._crumb}'") return self._crumb @utils.log_indent_decorator def _get_cookie_and_crumb_basic(self, timeout): self._get_cookie_basic(timeout) crumb = self._get_crumb_basic(timeout) return crumb def _get_cookie_csrf(self, timeout): if self._cookie is not None: utils.get_yf_logger().debug('reusing cookie') return True elif self._load_session_cookies(): utils.get_yf_logger().debug('reusing persistent cookie') self._cookie = True return True base_args = { 'timeout': timeout} get_args = {**base_args, 'url': 'https://guce.yahoo.com/consent'} try: if self._session_is_caching: get_args['expire_after'] = self._expire_after response = self._session.get(**get_args) else: response = self._session.get(**get_args) except requests.exceptions.ChunkedEncodingError: # No idea why happens, but handle nicely so can switch to other cookie method. utils.get_yf_logger().debug('_get_cookie_csrf() encountering requests.exceptions.ChunkedEncodingError, aborting') return False soup = BeautifulSoup(response.content, 'html.parser') csrfTokenInput = soup.find('input', attrs={'name': 'csrfToken'}) if csrfTokenInput is None: utils.get_yf_logger().debug('Failed to find "csrfToken" in response') return False csrfToken = csrfTokenInput['value'] utils.get_yf_logger().debug(f'csrfToken = {csrfToken}') sessionIdInput = soup.find('input', attrs={'name': 'sessionId'}) sessionId = sessionIdInput['value'] utils.get_yf_logger().debug(f"sessionId='{sessionId}") originalDoneUrl = 'https://finance.yahoo.com/' namespace = 'yahoo' data = { 'agree': ['agree', 'agree'], 'consentUUID': 'default', 'sessionId': sessionId, 'csrfToken': csrfToken, 'originalDoneUrl': originalDoneUrl, 'namespace': namespace, } post_args = {**base_args, 'url': f'https://consent.yahoo.com/v2/collectConsent?sessionId={sessionId}', 'data': data} get_args = {**base_args, 'url': f'https://guce.yahoo.com/copyConsent?sessionId={sessionId}', 'data': data} try: if self._session_is_caching: post_args['expire_after'] = self._expire_after get_args['expire_after'] = self._expire_after self._session.post(**post_args) self._session.get(**get_args) else: self._session.post(**post_args) self._session.get(**get_args) except requests.exceptions.ChunkedEncodingError: # No idea why happens, but handle nicely so can switch to other cookie method. utils.get_yf_logger().debug('_get_cookie_csrf() encountering requests.exceptions.ChunkedEncodingError, aborting') self._cookie = True self._save_session_cookies() return True @utils.log_indent_decorator def _get_crumb_csrf(self, timeout=30): # Credit goes to @bot-unit #1729 if self._crumb is not None: utils.get_yf_logger().debug('reusing crumb') return self._crumb if not self._get_cookie_csrf(timeout): # This cookie stored in session return None get_args = { 'url': 'https://query2.finance.yahoo.com/v1/test/getcrumb', 'timeout': timeout} if self._session_is_caching: get_args['expire_after'] = self._expire_after r = self._session.get(**get_args) else: r = self._session.get(**get_args) self._crumb = r.text if self._crumb is None or '' in self._crumb or self._crumb == '': utils.get_yf_logger().debug("Didn't receive crumb") return None utils.get_yf_logger().debug(f"crumb = '{self._crumb}'") return self._crumb @utils.log_indent_decorator def _get_cookie_and_crumb(self, timeout=30): crumb, strategy = None, None utils.get_yf_logger().debug(f"cookie_mode = '{self._cookie_strategy}'") with self._cookie_lock: if self._cookie_strategy == 'csrf': crumb = self._get_crumb_csrf() if crumb is None: # Fail self._set_cookie_strategy('basic', have_lock=True) crumb = self._get_cookie_and_crumb_basic(timeout) else: # Fallback strategy crumb = self._get_cookie_and_crumb_basic(timeout) if crumb is None: # Fail self._set_cookie_strategy('csrf', have_lock=True) crumb = self._get_crumb_csrf() strategy = self._cookie_strategy return crumb, strategy @utils.log_indent_decorator def get(self, url, params=None, timeout=30): return self._make_request(url, request_method = self._session.get, params=params, timeout=timeout) @utils.log_indent_decorator def post(self, url, body, params=None, timeout=30): return self._make_request(url, request_method = self._session.post, body=body, params=params, timeout=timeout) @utils.log_indent_decorator def _make_request(self, url, request_method, body=None, params=None, timeout=30): # Important: treat input arguments as immutable. if len(url) > 200: utils.get_yf_logger().debug(f'url={url[:200]}...') else: utils.get_yf_logger().debug(f'url={url}') utils.get_yf_logger().debug(f'params={params}') if params is None: params = {} if 'crumb' in params: raise Exception("Don't manually add 'crumb' to params dict, let data.py handle it") crumb, strategy = self._get_cookie_and_crumb() if crumb is not None: crumbs = {'crumb': crumb} else: crumbs = {} request_args = { 'url': url, 'params': {**params, **crumbs}, 'timeout': timeout } if body: request_args['json'] = body response = request_method(**request_args) utils.get_yf_logger().debug(f'response code={response.status_code}') if response.status_code >= 400: # Retry with other cookie strategy if strategy == 'basic': self._set_cookie_strategy('csrf') else: self._set_cookie_strategy('basic') crumb, strategy = self._get_cookie_and_crumb(timeout) request_args['params']['crumb'] = crumb response = request_method(**request_args) utils.get_yf_logger().debug(f'response code={response.status_code}') # Raise exception if rate limited if response.status_code == 429: raise YFRateLimitError() return response @lru_cache_freezeargs @lru_cache(maxsize=cache_maxsize) def cache_get(self, url, params=None, timeout=30): return self.get(url, params, timeout) def get_raw_json(self, url, params=None, timeout=30): utils.get_yf_logger().debug(f'get_raw_json(): {url}') response = self.get(url, params=params, timeout=timeout) response.raise_for_status() return response.json()