selenium¶

인터파크 투어 사이트에서 여행지 입력 후 검색 -> 잠시 후 -> 결과
로그인 시 pc 웹 사이트에서 처리가 어려울 경우 -> 모바일 로그인 진입
모듈 가져오기

%%html
 
<!-- 에디터 폰트를 조정합니다. -->
<style type='text/css'>
.CodeMirror{
    font-size: 14px;
    font-family: consolas;
}
</style>

# pip install selenium

모듈 가져오기¶

from selenium import webdriver as wd

사전에 필요한 정보 로드¶

DB 혹은 쉘, 배치 파일에서 인자로 받아서 세팅

# Entry URL that the browser opens first (passed to driver.get below).
main_url = 'http://tour.interpark.com/'
# Destination keyword that is typed into the site's search box.
keyword = '로마'

드라이버 로드¶

차후에 옵션을 부여하여 (프록시, 에이전트 조작, 이미지 배제)
크롤링을 오래 돌리면 임시파일들이 쌓임 - 템프(temp) 파일 삭제

# Load the driver: create a Chrome WebDriver using the chromedriver.exe binary
# referenced by a relative path.
# NOTE(review): `executable_path` is the older Selenium keyword; newer Selenium
# releases expect a Service object instead — confirm against the installed version.
driver = wd.Chrome(executable_path='chromedriver.exe')

사이트 접속 (get 방식)¶

# Navigate the browser to the landing page stored in main_url.
driver.get(main_url)

검색창을 찾아서 검색어 입력¶

# The search box is located by its id: SearchGNBText
driver.find_element_by_id('SearchGNBText').send_keys(keyword) # send_keys types the given text into the page element

# send_keys appends to whatever is already in the field, so when changing the value call .clear() first and then send_keys('text') again

검색버튼 클릭¶

# Click the element matching the .search-btn selector to submit the search.
driver.find_element_by_css_selector('.search-btn').click()

# After searching, "more" has to be clicked to reach the full list of overseas package products
# The search result list is shown paginated

잠시 대기¶

페이지가 로드되고 나서 즉각적으로 데이터를 획득하는 행위는 자제
검색 결과가 다 불러져 화면 구성할 때까지 대기해줘야 함

명시적 대기 : 특정 요소가 로케이트될(발견될) 때까지 대기
암묵적 대기 : DOM이 다 로드 될때까지 대기하고 먼저 로드되면 바로 진행
절대적 대기 : time.sleep(10) / 클라우드플레어(Cloudflare, 디도스 방어 솔루션)

from selenium.webdriver.common.by import By
# 명시적 대기를 위해 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time

# Explicit wait: give the driver up to 10 seconds for a single element with
# class "oTravelBox" to be present; the wait ends as soon as it shows up.
try:
    element = WebDriverWait(driver, timeout=10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'oTravelBox'))
    )
except Exception as wait_error:
    # Best effort: report the problem and let the following cells run anyway.
    print(' 오류 발생 ', wait_error)

# Implicit wait: instruct the driver to keep polling the DOM for up to
# 10 seconds when looking for an element, continuing as soon as it is found.
driver.implicitly_wait(time_to_wait=10)

검색 후, '더보기' 눌러서 게시판 진입¶

# Click the "more" button inside the .oTravelBox result box to enter the listing board.
driver.find_element_by_css_selector('.oTravelBox>.boxList>.moreBtnWrap>.moreBtn').click()

# When pulling a lot of data from a board, manage the session (in case the site can only be accessed after login)
# e.g. keep logging out and logging in again every so many items
# If a post has disappeared a popup appears ("no such ..."), so a way to handle popups needs to be reviewed
# When scanning a board there is the problem that the upper limit is not known.

# Scan the board -> collect meta information -> loop over it to visit every item in one go

페이징 처리 및 메타데이터 가져오기¶

+ 상품 정보 담는 리스트 생성 및 정보 담아주기¶

상품 정보를 담아주는 클래스 생성¶

class TourInfo:
    """Plain record holding the fields collected for one tour product.

    Only a subset of the columns available on the site is kept. Values are
    stored exactly as they are passed in; no parsing or validation is done.

    Args:
        title: Product title.
        price: Price value.
        area: Area value.
        link: Link value for the product detail page.
        img: Image value.
    """

    # Class-level defaults (kept for backward compatibility); each instance
    # overrides all of them in __init__.
    title = ''
    price = ''
    area = ''
    link = ''
    img = ''

    def __init__(self, title, price, area, link, img):
        # Initialise every member from the constructor arguments.
        self.title = title
        self.price = price
        self.area = area
        self.link = link
        self.img = img

    def __repr__(self):
        # Readable representation so that printing a list of TourInfo objects
        # shows the collected values instead of bare memory addresses.
        return '{}(title={!r}, price={!r}, area={!r}, link={!r}, img={!r})'.format(
            type(self).__name__,
            self.title,
            self.price,
            self.area,
            self.link,
            self.img,
        )
        
# 주피터에서 class 쓰고 싶으면 쓰고 싶은 부분 위에서 정의 후 사용

# The result board is paginated and driven by JavaScript:
# pages are switched by running searchModule.SetCategoryList(<page>, '').

# List that collects one TourInfo per scraped product.
tour_list = []

# Only page 1 is scanned for now. The board has pages 1-16; 17 was a temporary
# upper value used to observe what happens when going past the last page.
for page in range(1, 2):
    try:
        # Switch the board to the requested page through the site's own script.
        driver.execute_script("searchModule.SetCategoryList(%s, '')" % page)
        time.sleep(2)  # pause 2 seconds between page switches

        # When collecting from several sites, a step that defines the common
        # fields is needed: title, comment, period 1, period 2, price, rating,
        # thumbnail, link (product detail).
        # Select only the <li> items directly under the result list.
        boxItems = driver.find_elements_by_css_selector('.oTravelBox>.boxList>li')

        # Visit the products one by one. Every value is looked up exactly once,
        # printed, and then reused for the TourInfo record so that no element
        # is queried twice through the driver.
        for li in boxItems:
            item_title = li.find_element_by_css_selector('h5.proTit').text
            print( '상품명', item_title)
            item_comment = li.find_element_by_css_selector('p.proSub').text
            print( '코멘트', item_comment)
            item_period1 = li.find_element_by_css_selector('p.proInfo').text
            print( '기간1', item_period1)
            item_period2 = li.find_element_by_css_selector('p.proInfo:nth-child(2)').text
            print( '기간2', item_period2)
            item_price = li.find_element_by_css_selector('strong.proPrice').text
            print( '가격', item_price)
            item_rating = li.find_element_by_css_selector('.info-row div:nth-child(2) p.proInfo').text
            print( '평점', item_rating)
            # Alternative: loop over li.find_elements_by_css_selector('.info-row .proInfo')
            # and print each entry's text to dump every info row.

            # find(...).get_attribute(name) returns the value of an element attribute.
            item_img = li.find_element_by_css_selector('img').get_attribute('src')
            print( '썸네일', item_img)
            item_onclick = li.find_element_by_css_selector('a').get_attribute('onclick')
            print( '링크', item_onclick)

            # Open question: use the image URL as-is, or download the image
            # and upload it to our own server (ftp)?

            print('----------------------------------------------------------------')

            # Collect the data. Values may be missing or incomplete, so
            # addressing them by a fixed index would be risky.
            # NOTE(review): the rating text is what gets stored in the `area`
            # field here — confirm whether that is intended.
            obj = TourInfo(
                item_title,
                item_price,
                item_rating,
                item_onclick,
                item_img,
            )

            tour_list.append( obj )

    except Exception as e1:
        # Best effort: a failure anywhere on this page is reported and the
        # loop moves on to the next page.
        print('오류', e1)

print(tour_list, len(tour_list))

상품명 [4국10일_대한항공]1급_융프라우_런던&로마 야간투어
코멘트 #베르사유 궁전 내부#바티칸하이패스
기간1 여행 기간 : 8박10일
기간2 출발 가능 기간 : 2020.07.02~2021.03.28
가격 2,256,400 원~
평점 평점 9.7
썸네일 http://tourimage.interpark.com/product/tour/00161/A30/280/A3015008_14_870.jpg
링크 searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=A3015008','')
----------------------------------------------------------------
상품명 [로마6일] 바티칸투어무료/로마자유여행
코멘트 Bon Appetit!
기간1 여행 기간 : 4박6일
기간2 출발 가능 기간 : 2020.07.01~2021.01.31
가격 1,180,000 원~
평점 평점 9.7
썸네일 http://tourimage.interpark.com/product/tour/00161/B30/280/B3011455_9_663.jpg
링크 searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=B3011455','')
----------------------------------------------------------------
상품명 [이탈리아7일] 바티칸투어무료/로마+피렌체+베니스 자유여행
코멘트 이탈리아 BASIC
기간1 여행 기간 : 5박7일
기간2 출발 가능 기간 : 2020.07.01~2020.07.31
가격 1,877,000 원~
평점 평점 9.6
썸네일 http://tourimage.interpark.com/product/tour/00161/B30/280/B3011059_3_300.jpg
링크 searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=B3011059','')
----------------------------------------------------------------
상품명 [로마/피렌체/베니스 8일] 이탈리아일주+왕복직항_아시아나항공
코멘트 아시아나 직항 / 사전발권특가
기간1 여행 기간 : 6박8일
기간2 출발 가능 기간 : 2020.07.02~2020.10.31
가격 1,724,200 원~
평점 평점 9.8
썸네일 http://tourimage.interpark.com/product/tour/00161/B30/280/B3011892_1_267.jpg
링크 searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=B3011892','')
----------------------------------------------------------------
상품명 [서유럽8일] 도시간 이동포함/파리+베네치아+로마 자유여행
코멘트 프랑스/이태리 핵심도시!
기간1 여행 기간 : 6박8일
기간2 출발 가능 기간 : 2020.07.01~2020.10.31
가격 1,836,700 원~
평점 평점 9.3
썸네일 http://tourimage.interpark.com/product/tour/00161/B30/280/B3010988_7_707.jpg
링크 searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=B3010988','')
----------------------------------------------------------------
상품명 [유럽 반자유 허니문] 로마/몰타 6박8일_QR항공
코멘트 인기상품
기간1 여행 기간 : 6박8일
기간2 출발 가능 기간 : 2020.08.30~2020.12.13
가격 2,580,000 원~
평점 평점 0.0
썸네일 http://tourimage.interpark.com/product/tour/00161/D30/280/D3010706_1_600.jpg
링크 searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010706','')
----------------------------------------------------------------
상품명 [유럽 반자유 허니문] 로마/몰타 5박7일_QR항공
코멘트 인기상품
기간1 여행 기간 : 5박7일
기간2 출발 가능 기간 : 2020.08.30~2020.12.13
가격 2,500,000 원~
평점 평점 0.0
썸네일 http://tourimage.interpark.com/product/tour/00161/D30/280/D3010705_1_980.jpe
링크 searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010705','')
----------------------------------------------------------------
상품명 [유럽 반자유 허니문] 로마/프라하 6박8일_남부투어 1일 포함
코멘트 이태리+체코 연계상품
기간1 여행 기간 : 6박8일
기간2 출발 가능 기간 : 2020.08.30~2020.12.13
가격 2,250,000 원~
평점 평점 0.0
썸네일 http://tourimage.interpark.com/product/tour/00161/D30/280/D3010694_1_827.jpg
링크 searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010694','')
----------------------------------------------------------------
상품명 [유럽 반자유 허니문] 로마/피렌체/베니스 5박7일
코멘트 이태리일주
기간1 여행 기간 : 5박7일
기간2 출발 가능 기간 : 2020.08.30~2020.12.13
가격 2,060,000 원~
평점 평점 0.0
썸네일 http://tourimage.interpark.com/product/tour/00161/D30/280/D3010683_1_740.jpg
링크 searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010683','')
----------------------------------------------------------------
상품명 [유럽 반자유 허니문] 바르셀로나/로마 6박8일
코멘트 유럽/허니문/바르셀로나/로마
기간1 여행 기간 : 6박8일
기간2 출발 가능 기간 : 2020.07.01~2020.12.13
가격 2,090,000 원~
평점 평점 0.0
썸네일 http://tourimage.interpark.com/product/tour/00161/D30/280/D3010682_1_890.jpg
링크 searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010682','')
----------------------------------------------------------------
[<__main__.TourInfo object at 0x0000028AC6510D08>, <__main__.TourInfo object at 0x0000028AC61EFB48>, <__main__.TourInfo object at 0x0000028AC64D97C8>, <__main__.TourInfo object at 0x0000028AC60FD7C8>, <__main__.TourInfo object at 0x0000028AC648CB48>, <__main__.TourInfo object at 0x0000028AC64D2C08>, <__main__.TourInfo object at 0x0000028AC4109948>, <__main__.TourInfo object at 0x0000028AC683C908>, <__main__.TourInfo object at 0x0000028AC64D26C8>, <__main__.TourInfo object at 0x0000028AC64C5A08>] 10

수집한 정보 개수를 루프 -> 페이지 방문 -> 콘텐츠 획득(상품상세정보) -> DB¶

for tour in tour_list:
    # tour is a TourInfo. tour.link holds the onclick script text, e.g.
    #   searchModule.OnClickDetail('http://.../?BaseGoodsCd=A3015008','')
    # Fall back to an empty string so a product without an onclick value
    # (presumably None from get_attribute — verify) does not crash the loop.
    onclick = tour.link or ''

    # Fragment before the first comma, printed for inspection.
    link = onclick.split(',')[0]
    print(link)

    # The detail URL is the text between the first pair of single quotes.
    # This replaces the previous no-op replace() plus fixed-offset slicing,
    # which silently depended on the exact length of the script prefix.
    detail_url = onclick.partition("'")[2].partition("'")[0]
    print(detail_url)

    # Only navigate when the value looks like a complete URL (http...).
    if not detail_url.startswith('http'):
        continue

    # Move to the detail page and pause before the next one.
    driver.get( detail_url )
    time.sleep(2)

searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=A3015008'
http://tour.interpark.com/goods/detail/?BaseGoodsCd=A3015008
searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=B3011455'
http://tour.interpark.com/goods/detail/?BaseGoodsCd=B3011455
searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=B3011059'
http://tour.interpark.com/goods/detail/?BaseGoodsCd=B3011059
searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=B3011892'
http://tour.interpark.com/goods/detail/?BaseGoodsCd=B3011892
searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=B3010988'
http://tour.interpark.com/goods/detail/?BaseGoodsCd=B3010988
searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010706'
http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010706
searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010705'
http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010705
searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010694'
http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010694
searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010683'
http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010683
searchModule.OnClickDetail('http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010682'
http://tour.interpark.com/goods/detail/?BaseGoodsCd=D3010682

Beautiful Soup¶

from bs4 import BeautifulSoup as bs

# Build a Beautiful Soup DOM from the HTML source of the page the browser
# is currently showing.
soup = bs(markup=driver.page_source, features='html.parser')

# From the current detail page, pick up the schedule information
# (everything matching the .schedule-all selector).
data = soup.select('.schedule-all')

# Show what kind of object select() handed back.
print(type(data))

<class 'bs4.element.ResultSet'>

종료¶

# Shut down: close the browser and end the driver session.
driver.close()
driver.quit()
# Optional hard exit of the interpreter, left disabled:
# import sys
# sys.exit()

티스토리

T-아카데미 강의 : Python을 활용한 웹 크롤러 만들기( 하나투어 크롤링)

selenium¶

모듈 가져오기¶

사전에 필요한 정보 로드¶

드라이버 로드¶

사이트 접속 (get 방식)¶

검색창을 찾아서 검색어 입력¶

검색버튼 클릭¶

잠시 대기¶

검색 후, '더보기' 눌러서 게시판 진입¶

페이징 처리 및 메타데이터 가져오기¶

+ 상품 정보 담는 리스트 생성 및 정보 담아주기¶

상품 정보를 담아주는 클래스 생성¶

수집한 정보 개수를 루프 -> 페이지 방문 -> 콘텐츠 획득(상품상세정보) -> DB¶

Beautiful Soup¶

종료¶