Dowemo

When scraping website content with Python, it is easy to be blocked by a site's anti-crawling mechanism, and an important countermeasure is to use IP proxies. Many IP proxies can be found on the web, but stable paid proxies are expensive, so it is worthwhile to build your own proxy pool from free proxies. This article describes how to build your own IP proxy pool.

Two free IP proxy sites are recommended:
http://www.haoip.cc
http://www.xicidaili.com

This article builds the proxy pool using the first site (haoip.cc) as the source.

Modules required by the program:


import requests


import re


import random


import time


Crawl the IP addresses provided by the website into the array ip_list:


# Target page that lists free proxies, plus containers for the results.
# BUG FIX: the original URL had no scheme ('www.haoip.cc/tiqu.htm');
# requests.get() raises MissingSchema without an explicit 'http://'.
url = 'http://www.haoip.cc/tiqu.htm'

ip_list = []         # raw proxies scraped from the page
ip_list_washed = []  # proxies that passed the validity check



def get_ip_list(url):
    """Scrape the proxy-list page at *url* and append each entry to the
    module-level ``ip_list``.

    The page separates proxies with ``<br/>`` tags, so each entry is
    captured between a closing ``r/>`` and the next ``<b``.
    """
    html = requests.get(url)
    # re.S lets '.' match newlines, because an entry may span lines.
    ip_listn = re.findall(r'r/>(.*?)<b', html.text, re.S)
    for ipn in ip_listn:
        # BUG FIX: the original re.sub('n', '', ipn) deleted every
        # letter 'n' from the entry; the intent (per the original
        # comment, "remove newlines") was to strip newline characters.
        ip = re.sub(r'\n', '', ipn)
        ip_list.append(ip.strip())


Check the validity of each IP stored in ip_list:


# Since we use www.baidu.com to test proxy validity, set up request headers first


# Pool of desktop-browser User-Agent strings. check_ip() picks one at
# random per request so the test site is less likely to flag the
# traffic as automated.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]



def check_ip(ip):
    """Return True if the proxy *ip* ("host:port") is usable, else False.

    Sends a request through the proxy to a well-known page with a random
    User-Agent; any connection error, timeout, or non-200 status marks
    the proxy as invalid.
    """
    # BUG FIX: the original tested an https:// URL while the proxies dict
    # only had an 'http' entry, so requests bypassed the proxy and the
    # check always exercised the direct connection. Use an http URL so
    # the proxy is actually used.
    test_url = 'http://www.baidu.com'
    proxy = {'http': ip}
    # Rotate the User-Agent so the test site is less likely to block us.
    user_agent = random.choice(user_agent_list)
    headers = {'User-Agent': user_agent}
    try:
        response = requests.get(test_url, headers=headers, proxies=proxy, timeout=5)
    except Exception as e:
        print(e)
        return False
    # Throttle so repeated checks do not hammer the test site.
    # (The original also had an unreachable time.sleep(5) after the
    # return statements; that dead code has been removed.)
    time.sleep(5)
    return response.status_code == 200


Complete code (IPProxyPool.py):



import requests


import re


import random


import time



class IPProxyPool:
    """Build a pool of validated free HTTP proxies scraped from haoip.cc."""

    def __init__(self):
        # Validated proxies are accumulated here ("host:port" strings).
        self.ip_list = []
        # Pool of desktop-browser User-Agent strings; check_ip() picks
        # one at random per request so the test site is less likely to
        # flag the traffic as automated.
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]

    def get_ip_list(self, haoip_url='http://www.haoip.cc/tiqu.htm'):
        """Scrape *haoip_url*, validate each proxy, and keep the good
        ones in ``self.ip_list``.

        The page separates proxies with ``<br/>`` tags, so each entry
        is captured between a closing ``r/>`` and the next ``<b``.
        """
        html = requests.get(haoip_url)
        # re.S lets '.' match newlines, because an entry may span lines.
        ip_listn = re.findall(r'r/>(.*?)<b', html.text, re.S)
        for ipn in ip_listn:
            # BUG FIX: the original re.sub('n', '', ipn) deleted every
            # letter 'n'; the intent was to strip newline characters.
            ip = re.sub(r'\n', '', ipn).strip()
            # Keep only proxies that pass the validity check.
            statu = self.check_ip(ip)
            print(statu)
            if statu:
                self.ip_list.append(ip)
        print(self.ip_list)

    def check_ip(self, ip):
        """Return True if the proxy *ip* ("host:port") is usable.

        Sends a request through the proxy with a random User-Agent; any
        connection error, timeout, or non-200 status marks the proxy as
        invalid.
        """
        # BUG FIX: the original tested an https:// URL while the proxies
        # dict only had an 'http' entry, so requests bypassed the proxy
        # and the check always exercised the direct connection.
        test_url = 'http://www.baidu.com'
        proxy = {'http': ip}
        headers = {'User-Agent': random.choice(self.user_agent_list)}
        try:
            response = requests.get(test_url, headers=headers, proxies=proxy, timeout=5)
        except Exception as e:
            print(e)
            return False
        # Throttle so repeated checks do not hammer the test site.
        # (The original also had an unreachable time.sleep(5) after the
        # return statements; that dead code has been removed.)
        time.sleep(5)
        return response.status_code == 200



# BUG FIX: the original did `IPProxyPool = IPProxyPool()`, rebinding the
# class name to an instance (the class becomes unreachable and a second
# instantiation would fail). Use a distinct instance name, and guard the
# script entry point so importing this module triggers no network I/O.
if __name__ == '__main__':
    pool = IPProxyPool()
    pool.get_ip_list()


The full code is available on GitHub (new features and optimizations will continue to be added; if this helped you, please star the repository).
My blog: orient (still a work in progress).




Copyright © 2011 Dowemo All rights reserved.    Creative Commons   AboutUs