[Python][bs4] Web Crawler

Here are the prerequisites for running the Python crawler:

  1. Python 3+
  2. Beautiful Soup module
    pip install beautifulsoup4
    sudo apt-get install python-bs4  # on Linux
  3. Selenium WebDriver
    pip install -U selenium
  4. ChromeDriver – https://sites.google.com/a/chromium.org/chromedriver/downloads
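
With everything installed, a quick smoke test confirms Selenium can drive Chrome (a minimal sketch; replace the chromedriver path with your own):

from selenium import webdriver

driver = webdriver.Chrome('/path/to/chromedriver')  # example path, adjust to your install
driver.get('https://example.com')
print(driver.title)  # prints "Example Domain" when everything is wired up
driver.quit()

The full crawling script follows.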
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import openpyxl
import inspect

def CallVariable(varname):
    # return the name the given object is bound to in the caller's scope
    callers_local_vars = inspect.currentframe().f_back.f_locals.items()
    stringvar = [k for k, v in callers_local_vars if v is varname][0]
    return stringvar
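
# Example (a sketch of how CallVariable is used below):
#   prd_management = ['C200', ...]
#   CallVariable(prd_management)   # -> 'prd_management'
# If several names are bound to the same object, the earliest-defined one wins,
# which is why the loops below recover the descriptive list names, not 'param_list'.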


driver = webdriver.Chrome('/Users/ADMIN/Desktop/dev/chromedriver')  # path to your chromedriver binary

# login info (fill in only if the site requires login)
login_domain = '{domain}'
login_id = '{login id}'
login_passwd = '{login password}'


url = '{target url}'

driver.get(url)

driver.implicitly_wait(5)

# force the login popup visible so its form fields can be filled
driver.execute_script("document.getElementById('login-popup').style.display = 'block';")

# fill in and submit the login form
driver.find_element_by_xpath('//*[@id="login-domain"]').send_keys(login_domain)
driver.find_element_by_xpath('//*[@id="login-id"]').send_keys(login_id)

driver.find_element_by_xpath('//*[@id="login-pwd"]').clear()
driver.find_element_by_xpath('//*[@id="login-pwd"]').send_keys(login_passwd)

driver.find_element_by_xpath('//*[@id="login-popup"]/div[2]/form[2]/input[4]').click()
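
# note: a more robust alternative to implicitly_wait (a sketch, not part of the
# original flow) is an explicit wait for the submit button:
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#   from selenium.webdriver.common.by import By
#   WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
#       (By.XPATH, '//*[@id="login-popup"]/div[2]/form[2]/input[4]'))).click()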

# reuse the logged-in state outside the browser: copy Selenium's cookies into a
# requests.Session so plain HTTP fetches are authenticated (and much faster)
s = requests.Session()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
s.headers.update(headers)

for cookie in driver.get_cookies():
    s.cookies.update({cookie['name']: cookie['value']})

# hide overlays that cover the page after login
driver.execute_script("document.getElementById('passwd_keep').style.display = 'none';")
driver.execute_script("document.getElementById('bxslider2_bg').style.display = 'none';")

ez_prd_manage_url = 'http://ca09.ezadmin.co.kr/template25.htm?template='
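
# sanity check (a sketch; 'C200' is one of the template codes listed below):
#   r = s.get(ez_prd_manage_url + 'C200')
#   print(r.status_code, len(r.text))
# if the body is the login form instead of the page, the cookie handoff failed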


# template codes for each admin tab
prd_management = ['C200','C100','CX10','C600','C620','C242','CW00','CF00','CI00','CI10','CI20','CI30','CO00','CO01','CO02']
baseInfo_management =['B100','B120','B121','B200','B203','B400','B700','BP00','BA00','BB00','BC00','BQ00','BD00','BL00','BL10','BN00','BN10','BG02','BJ00','BM00','BM10','BR00']
order_management =['DC10','DF02','DL02','DE00','DS00','DG00','DT10','DR00','DR01','D500','DF00','DZ00','DB00','DB01','DB10','DB11','DM00','DK00','DQ00','DQ01','DQ02','DD00','DD03','DD06','S600','S700']
cs_management = ['E300','EN10','EN00','EN03','EY00','EL00','EL02','E210','E200','E220','EO00','EC00','EC10','E420','EV00','E804','E807','ET00','E809','EP00']
stock_management = ['IG00','I100','IC00','IE00','I210','IA00','IZ00','ID00','IJ00','IU00','IV00','I400','IO00','IM00','IM20','IM30','I300','I310','I330','IY00','S500','S502','S501','I700','I710','IU10','IH00','IH30','IH10','I600','IH60','IT00','IT10','IT40','IT20','IT30','IP00','IP10']
settle_management = ['F304','FX00','F308','F316','FV30','FL10','FL30','FL40','FP00','F200','FV10','FV20','FH00','FI00','FH10','FG00','F500','FB00','FS00','F100','FW00','FW20','FW10']

cs_url_local = "C:/Users/ADMIN/Desktop/C_S_local.html"
cs_url_popup = 'http://ca09.ezadmin.co.kr/popup25.htm?template=E900&_si=ltfogt3avms5bfdlv7pqdkj1m7'
cs_return_req_url = "http://ca09.ezadmin.co.kr/popup25.htm?template=EX00"

# crawl targets
params_list = [prd_management, baseInfo_management, order_management, cs_management, stock_management, settle_management]
popups_list = [cs_url_popup, cs_return_req_url]



for param_list in params_list:
    wb = openpyxl.Workbook()  # one workbook per admin tab
    for param in param_list:
        response = s.get(ez_prd_manage_url + param).text
        soup = BeautifulSoup(response, features='html.parser')
        for script in soup(["script", "style"]):  # kill all script and style elements
            script.extract()    # rip it out
        pure_text = soup.get_text()
        
        # break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in pure_text.splitlines())

        # split multi-phrase lines apart on double spaces
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # drop blank lines
        final_text = '\n'.join(chunk for chunk in chunks if chunk)
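        # e.g. a stripped line 'Product   code    Name' splits into the chunks
        # 'Product', 'code', 'Name' (illustrative input, not actual ezadmin text)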

        line_break = final_text.split('\n')
        i = 2  # row 1 holds the 'KR' header

        # create one excel sheet per template code
        ws = wb.create_sheet(title=param, index=0)
        ws['A1'] = 'KR'
        for line in line_break:
            if line == '-':  # skip bare separator rows
                continue
            ws['A' + str(i)] = line
            i += 1

    # save excel; CallVariable recovers the tab list's variable name
    # (e.g. 'prd_management') to build the file name
    wb.save('C:/Users/ADMIN/Desktop/dev/ezadmin-trans/ez-admin-KR_' + CallVariable(param_list) + '.xlsx')


# in case of popup html
for popup_list in popups_list:
    wb = openpyxl.Workbook()
    response = s.get(popup_list).text
    soup = BeautifulSoup(response, features='html.parser')
    for script in soup(["script", "style"]):  # kill all script and style elements
        script.extract()    # rip it out
    pure_text = soup.get_text()
    
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in pure_text.splitlines())

    # split multi-phrase lines apart on double spaces
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    final_text = '\n'.join(chunk for chunk in chunks if chunk)

    line_break = final_text.split('\n')
    i = 2  # row 1 holds the 'KR' header

    # create an excel sheet named after the popup URL variable
    ws = wb.create_sheet(title=CallVariable(popup_list), index=0)
    ws['A1'] = 'KR'
    for line in line_break:
        if line == '-':  # skip bare separator rows
            continue
        ws['A' + str(i)] = line
        i += 1

    # save excel; fill in {path}, and include the popup name in it, or the
    # second iteration will overwrite the first file
    wb.save('C:/Users/ADMIN/Desktop/dev/{path}')



# in case of crawling local html
# if the network version doesn't work, download the HTML file and crawl it in a local environment

# wb = openpyxl.Workbook()
# f = open(cs_url_local, "r", encoding="utf8")
# soup = BeautifulSoup(f,features='html.parser')
# for script in soup(["script", "style"]): # kill all script and style elements
#     script.extract()    # rip it out
# pure_text = soup.get_text()

# # break into lines and remove leading and trailing space on each
# lines = (line.strip() for line in pure_text.splitlines())

# chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# # drop blank lines
# final_text = '\n'.join(chunk for chunk in chunks if chunk)

# line_break = final_text.split('\n')
# i = 2

# # create excel sheet
# ws = wb.create_sheet(title=CallVariable(cs_url_local), index=0)
# ws['A1'] = 'KR'
# for line in line_break:
#     if line == '-':
#         continue
#     ws['A' + str(i)] = line
#     i += 1

# #save excel 
# wb.save('C:/Users/ADMIN/Desktop/dev/{path}')
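
The same strip-scripts, clean-text, write-to-sheet routine appears three times above. Here is a sketch of it factored into a single helper (the function and parameter names are mine, not from the original script):

def dump_page_text(wb, html, sheet_title):
    # strip script/style tags, collapse whitespace, and write the visible
    # text of the page into a new sheet, one chunk per row
    soup = BeautifulSoup(html, features='html.parser')
    for script in soup(["script", "style"]):
        script.extract()
    lines = (line.strip() for line in soup.get_text().splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    ws = wb.create_sheet(title=sheet_title, index=0)
    ws['A1'] = 'KR'
    row = 2
    for chunk in chunks:
        if not chunk or chunk == '-':  # drop blanks and bare separators
            continue
        ws['A' + str(row)] = chunk
        row += 1

# usage, e.g. for one admin tab:
#   wb = openpyxl.Workbook()
#   for param in prd_management:
#       dump_page_text(wb, s.get(ez_prd_manage_url + param).text, param)
#   wb.save('C:/Users/ADMIN/Desktop/dev/ezadmin-trans/ez-admin-KR_prd_management.xlsx')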