import urllib

# The target address is expected to already be stored in `url`
# (it is not defined in this snippet).
# NOTE: urllib.urlopen is the Python 2 API.
htmlfile = urllib.urlopen(url)
htmltext = htmlfile.read()
print(htmltext)
I'm trying to get the source code of a URL.
Instead, I get the source code of a different page saying two things: "please enable cookies", and that the domain has banned access based on the browser's signature.
Is there a way anyone knows of to get the source code when the site can tell the request is not coming from a browser on the page?
You may have to set up a URL opener:
def createopener(self):
    """Build a urllib2 opener with cookie support and custom default headers.

    A new ``mycookiejar`` is created and given an RFC 2965-enabled default
    cookie policy, then wrapped in an ``HTTPCookieProcessor`` so the opener
    stores and sends cookies. The opener's default headers are set to a
    user-agent obtained from ``self.getuseragent()`` and a fixed host entry.

    Returns:
        The configured ``urllib2.OpenerDirector``.
    """
    cj = mycookiejar()
    cj.set_policy(cookielib.DefaultCookiePolicy(rfc2965=True))

    handlers = [urllib2.HTTPCookieProcessor(cj)]
    opener = urllib2.build_opener(*handlers)
    opener.addheaders = [
        ('user-agent', self.getuseragent()),
        ('host', 'google.com'),
    ]
    return opener
where the cookie jar is:
class mycookiejar(cookielib.CookieJar):
    """Cookie jar that strips double quotes from a cookie's ``version`` value."""

    def _cookie_from_cookie_tuple(self, tup, request):
        """Normalize the ``version`` attribute, then defer to the base class.

        Args:
            tup: 4-item cookie tuple ``(name, value, standard, rest)``; the
                ``standard`` mapping is updated in place when it carries a
                version.
            request: passed through unchanged to the base implementation.

        Returns:
            Whatever ``cookielib.CookieJar._cookie_from_cookie_tuple`` returns.
        """
        _name, _value, standard, _rest = tup
        version = standard.get('version', None)
        if version is not None:
            # Servers may send the version quoted (e.g. '"1"'); remove the
            # quotes before the base class processes it.
            # NOTE(review): presumably the base class parses this as an
            # integer -- confirm against cookielib.
            version = version.replace('"', '')
            standard["version"] = version
        return cookielib.CookieJar._cookie_from_cookie_tuple(self, tup, request)
At this point you create the opener and fetch the data by reading from the URL handle, like this:
def fetchurl(self, url, data=None, headers=None):
    """Fetch ``url`` through a freshly created opener and return the body.

    Args:
        url: address to request.
        data: optional request body handed to ``urllib2.Request``.
        headers: optional mapping of extra request headers; defaults to
            no extra headers.

    Returns:
        The response body as returned by the handle's ``read()``.
    """
    # Avoid a shared mutable default argument.
    if headers is None:
        headers = {}
    request = urllib2.Request(url, data, headers)
    # NOTE(review): a new opener is built on every call and stored on the
    # instance -- confirm that not reusing a previous opener is intended.
    self.opener = self.createopener()
    urlhandle = self.opener.open(request)
    try:
        return urlhandle.read()
    finally:
        # Release the connection even if read() raises.
        urlhandle.close()
It's a good idea to have a list of user agents
and read it in:
# Read the user-agent strings, one per line, from the file at `ffpath`
# (`ffpath` is expected to be defined beforehand).
with open(ffpath) as f:
    user_agents_list = f.read().splitlines()
and pick a random one from it:
# Choose a random position in the list (randint is inclusive on both ends,
# hence the -1) and take the user agent stored there.
index = random.randint(0, len(user_agents_list) - 1)
ua = user_agents_list[index]
To get a list of user agents, take a look here.
This should give you the idea, without needing an external framework.