Here are the required steps before you run the Python crawler:
- Python 3+
- Beautiful Soup module:
  - `pip install beautifulsoup4`
  - `sudo apt-get install python-bs4` (for Linux)
- Selenium WebDriver:
  - `pip install -U selenium`
- ChromeDriver – https://sites.google.com/a/chromium.org/chromedriver/downloads
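Once everything is installed, it's worth a quick sanity check before running the crawler. Below is a minimal sketch, assuming ChromeDriver is unpacked at the placeholder path; it confirms the modules import and that Selenium can drive Chrome (the positional driver path matches the Selenium 3-style call used in the script that follows):

```python
# Environment smoke test: verify the imports and the ChromeDriver setup.
from bs4 import BeautifulSoup   # from beautifulsoup4
from selenium import webdriver  # from selenium
import openpyxl                 # used later for the Excel export

# Placeholder path -- point this at your own ChromeDriver binary.
driver = webdriver.Chrome('/path/to/chromedriver')
driver.get('https://www.example.com')
print(driver.title)  # should print "Example Domain"
driver.quit()
```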
The full crawler follows. It logs into the ezadmin back office with Selenium, copies the authenticated cookies into a `requests` session, then fetches each admin template page, strips the script and style elements, and writes the remaining visible text into one Excel workbook per tab (plus a second pass for the popup pages, and a commented-out variant for crawling a locally saved HTML file).

```python
from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium import webdriver
import urllib
import requests
import openpyxl
import time
import inspect

def CallVariable(varname):
    # Return the name (in the caller's scope) bound to the object
    # passed in -- used below to name the output Excel files.
    callers_local_vars = inspect.currentframe().f_back.f_locals.items()
    stringvar = [k for k, v in callers_local_vars if v is varname][0]
    return stringvar

driver = webdriver.Chrome('/Users/ADMIN/Desktop/dev/chromedriver')

# login info -- fill in the placeholders if the site requires a login
login_domain = '{domain}'
login_id = '{login id}'
login_passwd = '{login password}'
url = '{target url}'

driver.get(url)
driver.implicitly_wait(5)

# open the login popup and submit the credentials
driver.execute_script("document.getElementById('login-popup').style.display = 'block';")
driver.find_element_by_xpath('//*[@id="login-domain"]').send_keys(login_domain)
driver.find_element_by_xpath('//*[@id="login-id"]').send_keys(login_id)
driver.find_element_by_xpath('//*[@id="login-pwd"]').clear()
driver.find_element_by_xpath('//*[@id="login-pwd"]').send_keys(login_passwd)
driver.find_element_by_xpath('//*[@id="login-popup"]/div[2]/form[2]/input[4]').click()

# copy the authenticated Selenium cookies into a requests session
s = requests.Session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
s.headers.update(headers)
for cookie in driver.get_cookies():
    c = {cookie['name']: cookie['value']}
    s.cookies.update(c)

# hide popups that cover the page
driver.execute_script("document.getElementById('passwd_keep').style.display = 'none';")
driver.execute_script("document.getElementById('bxslider2_bg').style.display = 'none';")

ez_prd_manage_url = 'http://ca09.ezadmin.co.kr/template25.htm?template='

# template codes per admin tab
# Manage Product tab
prd_management = ['C200','C100','CX10','C600','C620','C242','CW00','CF00','CI00','CI10','CI20','CI30','CO00','CO01','CO02']
baseInfo_management = ['B100','B120','B121','B200','B203','B400','B700','BP00','BA00','BB00','BC00','BQ00','BD00','BL00','BL10','BN00','BN10','BG02','BJ00','BM00','BM10','BR00']
order_management = ['DC10','DF02','DL02','DE00','DS00','DG00','DT10','DR00','DR01','D500','DF00','DZ00','DB00','DB01','DB10','DB11','DM00','DK00','DQ00','DQ01','DQ02','DD00','DD03','DD06','S600','S700']
cs_management = ['E300','EN10','EN00','EN03','EY00','EL00','EL02','E210','E200','E220','EO00','EC00','EC10','E420','EV00','E804','E807','ET00','E809','EP00']
stock_management = ['IG00','I100','IC00','IE00','I210','IA00','IZ00','ID00','IJ00','IU00','IV00','I400','IO00','IM00','IM20','IM30','I300','I310','I330','IY00','S500','S502','S501','I700','I710','IU10','IH00','IH30','IH10','I600','IH60','IT00','IT10','IT40','IT20','IT30','IP00','IP10']
settle_management = ['F304','FX00','F308','F316','FV30','FL10','FL30','FL40','FP00','F200','FV10','FV20','FH00','FI00','FH10','FG00','F500','FB00','FS00','F100','FW00','FW20','FW10']

cs_url_local = "C:/Users/ADMIN/Desktop/C_S_local.html"
cs_url_popup = 'http://ca09.ezadmin.co.kr/popup25.htm?template=E900&_si=ltfogt3avms5bfdlv7pqdkj1m7'
cs_return_req_url = "http://ca09.ezadmin.co.kr/popup25.htm?template=EX00"

# parse target list
params_list = [prd_management, baseInfo_management, order_management, cs_management, stock_management, settle_management]
popups_list = [cs_url_popup, cs_return_req_url]

for param_list in params_list:
    wb = openpyxl.Workbook()
    for param in param_list:
        response = s.get(ez_prd_manage_url + param).text
        soup = BeautifulSoup(response, features='html.parser')
        for script in soup(["script", "style"]):  # kill all script and style elements
            script.extract()  # rip it out
        pure_text = soup.get_text()
        # break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in pure_text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        # drop blank lines
        final_text = '\n'.join(chunk for chunk in chunks if chunk)
        line_break = final_text.split('\n')
        i = 2
        # create excel sheet
        ws = wb.create_sheet(title=param, index=0)
        ws['A1'] = 'KR'
        for text in line_break:
            if text == '-':
                continue
            ws['A' + str(i)] = text
            i += 1
    # save excel -- one workbook per tab, named after its template list
    wb.save('C:/Users/ADMIN/Desktop/dev/ezadmin-trans/ez-admin-KR_' + CallVariable(param_list) + '.xlsx')

# in case of popup html
for popup_list in popups_list:
    wb = openpyxl.Workbook()
    response = s.get(popup_list).text
    soup = BeautifulSoup(response, features='html.parser')
    for script in soup(["script", "style"]):  # kill all script and style elements
        script.extract()  # rip it out
    pure_text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in pure_text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    # drop blank lines
    final_text = '\n'.join(chunk for chunk in chunks if chunk)
    line_break = final_text.split('\n')
    i = 2
    # create excel sheet
    ws = wb.create_sheet(title=CallVariable(popup_list), index=0)
    ws['A1'] = 'KR'
    for text in line_break:
        if text == '-':
            continue
        ws['A' + str(i)] = text
        i += 1
    # save excel ({path} is a placeholder for your output file name)
    wb.save('C:/Users/ADMIN/Desktop/dev/{path}')

# in case of crawling local html:
# when the network version does not work, you can download the HTML file
# and crawl it in a local environment
# wb = openpyxl.Workbook()
# f = open(cs_url_local, "r", encoding="utf8")
# soup = BeautifulSoup(f, features='html.parser')
# for script in soup(["script", "style"]):  # kill all script and style elements
#     script.extract()  # rip it out
# pure_text = soup.get_text()
# # break into lines and remove leading and trailing space on each
# lines = (line.strip() for line in pure_text.splitlines())
# chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
# # drop blank lines
# final_text = '\n'.join(chunk for chunk in chunks if chunk)
# line_break = final_text.split('\n')
# i = 2
# # create excel sheet
# ws = wb.create_sheet(title=CallVariable(cs_url_local), index=0)
# ws['A1'] = 'KR'
# for text in line_break:
#     if text == '-':
#         continue
#     ws['A' + str(i)] = text
#     i += 1
# # save excel
# wb.save('C:/Users/ADMIN/Desktop/dev/{path}')
```
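One piece worth unpacking is the `CallVariable` helper: it walks up to the caller's frame with `inspect` and recovers the *name* bound to the object that was passed in, which is how each workbook file ends up named after its template list. A standalone sketch of the same trick (the sample list here is hypothetical):

```python
import inspect

def CallVariable(varname):
    # Scan the caller's local names and return the first one bound
    # to the exact object (identity check) that was passed in.
    callers_local_vars = inspect.currentframe().f_back.f_locals.items()
    return [k for k, v in callers_local_vars if v is varname][0]

prd_management = ['C200', 'C100']    # hypothetical sample list
print(CallVariable(prd_management))  # prints: prd_management
```

Because the lookup compares by identity (`is`) and takes the first match, an object with several aliases (such as a loop variable pointing at one of the template lists) resolves to whichever name was defined first, which in the script above is the original list name rather than the loop variable.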
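To spot-check the output, the generated workbooks can be read back with openpyxl. The file name below follows the naming pattern the script uses, but treat the path as an assumption and adjust it to your machine:

```python
import openpyxl

# Assumed path, matching the script's 'ez-admin-KR_<list name>.xlsx' pattern.
wb = openpyxl.load_workbook(
    'C:/Users/ADMIN/Desktop/dev/ezadmin-trans/ez-admin-KR_prd_management.xlsx')
for name in wb.sheetnames:
    ws = wb[name]
    # row 1 holds the 'KR' header, so subtract it from the count
    print(name, ws.max_row - 1, 'text rows')
```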