멜론 크롤링

오준돌 2023. 10. 13. 20:53

멜론 사용자의 플레이 리스트를 크롤링 해보았다.

플레이 리스트는 사용자들의 노래방리스트들의 정보가 담겨있고 약 2000여개의 리스트들과 각 리스트들에 대한 노래정보들을 가져온다. 멜론에서는 따로 api를 발급해주지 않기 때문에 셀레니움을 이용하여 크롤링 할 예정이다.

크롤링은 먼저 셀레니움으로 사용자들의 플레이리스트 id값들을 얻어온 후, request를 통해서 각 리스트 안에 있는 노래 정보들을 다시 한번 가져오는 순서로 진행한다.

1. 우선 셀레니움에 필요한 라이브러리들을 임포트 해준다.

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import random
import pandas as pd
import re

2. 리스트 id 크롤링

한 페이지에는 10개의 리스트가 보이며, xpath를 이용하여 페이지 버튼의 규칙을 찾았다. 1-10까지 페이지 버튼이 있는 구조였기 때문에, button_index를 하나씩 증가시키면서 크롤링될 수 있게 만들었다.

이후 첫 번째 페이지부터 마지막 페이지까지 반복문으로 리스트들의 아이디값을 얻되, 마지막 페이지에서 정보가 없으면 break하는 식으로 진행하였다. 하지만 이 방식이 잘 동작하지 않아, 마지막 페이지의 다음 페이지가 없으면 마지막 페이지를 한 번 더 크롤링한 뒤 break하고, duplicate를 사용해 중복을 제거하여 최종 데이터를 산출해야 했다.

페이지 전처리는 beautifulsoup을 사용하여 추출한 뒤 정규표현식을 사용하였다.

from selenium.common.exceptions import NoSuchElementException

from selenium.webdriver.chrome.service import Service

# Compiled once: a 9-digit playlist id is looked up in each link's href, and
# the playlist name is the part of the title attribute before " - 페이지 이동".
_ID_PATTERN = re.compile(r'(\d{9})')
_TITLE_PATTERN = re.compile(r'^(.+?) - 페이지 이동', re.MULTILINE)


def _extract_playlists(soup):
    """Return (playlist_id, playlist_title) pairs for one parsed result page.

    Only <a class="wrap_thumb"> links are considered. Links whose href or
    title does not match the expected patterns are skipped instead of
    raising IndexError/KeyError.
    """
    found = []
    for anchor in soup.find_all('a', class_="wrap_thumb"):
        id_match = _ID_PATTERN.search(anchor.get('href', ''))
        title_match = _TITLE_PATTERN.search(anchor.get('title', ''))
        if id_match is None or title_match is None:
            continue
        found.append((id_match.group(1), title_match.group(1)))
    return found


# `service` was used without being defined; build it from the already-imported
# ChromeDriverManager so the script is runnable on its own.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

URL = "https://www.melon.com/search/dj/index.htm?q=%EB%85%B8%EB%9E%98%EB%B0%A9&section=&searchGnbYn=Y&kkoSpl=N&kkoDpType=#params%5Bq%5D=%25EB%2585%25B8%25EB%259E%2598%25EB%25B0%25A9&params%5Bsort%5D=weight&params%5Bsection%5D=all&po=pageObj&startIndex=1"
driver.get(URL)
time.sleep(4)

# Numbered page buttons inside the pagination bar, and the button that moves
# to the next group of pages (the element carrying class "btn_next").
next_base_xpath = '//*[@id="pageObjNavgation"]/div/span/a[{}]'
last_page_xpath = '//*[@id="pageObjNavgation"]/div/a[3]'

button_index = 1
data = []
seen_ids = set()  # de-duplicate while collecting so `df` has one row per playlist

while True:
    # Always scrape the page that is currently displayed BEFORE deciding where
    # to go next, so the last page of every group (and of the whole result
    # set) is not lost when no further button exists.
    soup_source = BeautifulSoup(driver.page_source, 'html.parser')
    playlists = _extract_playlists(soup_source)

    if not playlists:  # nothing on this page: stop crawling
        break

    for video_id, video_title in playlists:
        if video_id in seen_ids:
            continue
        seen_ids.add(video_id)
        data.append({'Video_ID': video_id, 'Video_Title': video_title})

    try:
        # find_element_by_xpath was removed in Selenium 4; use By.XPATH.
        next_button = driver.find_element(
            By.XPATH, next_base_xpath.format(button_index)
        )
    except NoSuchElementException:
        # No numbered button left in this group: move on to the next group,
        # unless the "next" control is missing or disabled (end of results).
        next_group_link = soup_source.find('a', class_='btn_next')
        if next_group_link is None or 'disabled' in next_group_link.get('class', []):
            break
        try:
            driver.find_element(By.XPATH, last_page_xpath).click()
        except NoSuchElementException:
            break
        button_index = 1
    else:
        next_button.click()
        button_index += 1

    time.sleep(4)  # give the page time to load the next result list

df = pd.DataFrame(data, columns=['Video_ID', 'Video_Title'])

크롤링 결과

3. 이제 request로 url에 id값들을 넣으면서 해당 사용자 리스트에 대한 노래정보들을 가져와야 한다.

여기서 문제점은 해당 리스트가 삭제된 경우에도 페이지가 남아 있다는 것이어서, 이 부분에 대한 예외처리를 해야만 했다.

데이터 형태는 title - artist 로 가져오고 저장은 dictionary형태로 저장하였다.

import tqdm
import time
from bs4 import BeautifulSoup
import pandas as pd

def _link_text(div):
    """Return the text of the first <a> inside *div*, or the div's own text."""
    anchor = div.find("a")
    return anchor.text if anchor is not None else div.text.strip()


# Maps playlist name -> list of "title-artist" strings for the songs in it.
# NOTE: playlists that share the same name overwrite each other because the
# name is used as the key.
playlist_songs = {}
for playlist_id in tqdm.tqdm(list_num):
    url = f"https://www.melon.com/mymusic/dj/mymusicdjplaylistview_inform.htm?plylstSeq={playlist_id}"

    driver.get(url)
    time.sleep(3)  # wait for the playlist page to finish loading

    soup_source = BeautifulSoup(driver.page_source, 'html.parser')

    titles = soup_source.find_all("div", class_="ellipsis rank01")
    singers = soup_source.find_all("div", class_="ellipsis rank02")

    # A page with no song rows (e.g. a deleted playlist) is skipped.
    if not titles or not singers:
        print(f"Skipping index {playlist_id} due to error: Data not found")
        continue

    # Titles and artists are paired by position, so the counts must agree.
    if len(titles) != len(singers):
        print(
            f"Skipping index {playlist_id} due to error: "
            f"{len(titles)} titles but {len(singers)} artists"
        )
        continue

    name_tag = soup_source.find("div", class_="ellipsis song_name")
    if name_tag is None:
        print(f"Skipping index {playlist_id} due to error: Playlist name not found")
        continue

    playlist_songs[name_tag.text.strip()] = [
        f"{_link_text(title_div)}-{_link_text(singer_div)}"
        for title_div, singer_div in zip(titles, singers)
    ]

# Backward-compatible alias: the result used to be stored under the name
# `dict`, which shadows the builtin. Prefer `playlist_songs` in new code.
dict = playlist_songs

결과 값

※ 딕셔너리 형태 그대로 피클 파일로 저장한 이유는 용량이 적고, 이후 모델링을 할 때 필요한 데이터 형태로 언제든지 자유자재로 전처리할 수 있도록 하기 위해서이다.