library(RSelenium)
library(wdman)
library(rvest)
library(plyr)
library(xml2)
library(jsonlite)

# NOTE(review): machine-specific working directory. The image downloaded at
# the end of this section is written relative to it ("art.jpg").
setwd("D:/Talks/Scraping Workshop")

### WIKIART ###################

## TO GET LIST OF ARTWORKS ##
# Parse the text-only listing of all works by Jacques-Louis David.
page <- read_html(
  "https://www.wikiart.org/en/jacques-louis-david/all-works/text-list"
)

## Identify CSS Selector

## TO GET ONE ARTWORK NAME AND WEBLINK ##
# Worked example on a single list row (the 9th child) before generalising.
art <- page %>%
  html_nodes("li.painting-list-text-row:nth-child(9) > a:nth-child(1)") %>%
  html_text()
art

link <- page %>%
  html_nodes("li.painting-list-text-row:nth-child(9) > a:nth-child(1)") %>%
  html_attr("href")
link

## We want all the artworks

## TO GET ALL ARTWORK NAMES AND WEBLINKS IN ONE GO ##
# Dropping the :nth-child(9) filter matches the first anchor of every row.
art <- page %>%
  html_nodes("li.painting-list-text-row > a:nth-child(1)") %>%
  html_text()

# Fail early with a readable message: with zero matches, paste0() below would
# still return a length-1 string and the data-frame assignment would fail with
# an unrelated-looking "replacement has 1 row, data has 0" error.
if (length(art) == 0) {
  stop(
    "No artworks matched 'li.painting-list-text-row > a:nth-child(1)'; ",
    "the page layout may have changed.",
    call. = FALSE
  )
}

# Keep titles as character regardless of the R version's default.
art <- as.data.frame(art, stringsAsFactors = FALSE)

link <- page %>%
  html_nodes("li.painting-list-text-row > a:nth-child(1)") %>%
  html_attr("href")

# The scraped hrefs are prefixed with the site root to form full URLs.
art$link <- paste0("https://www.wikiart.org", link)

## TO GET ARTWORK SPECIFIC DATA ######################
# Fetch the detail page of the first artwork in the list (row 1, link column).
page <- read_html(art[1, 2])

title <- page %>%
  html_nodes(".wiki-layout-artist-info > article:nth-child(3) > h3:nth-child(1)") %>%
  html_text()

artist <- page %>%
  html_nodes(".wiki-layout-artist-info > article:nth-child(3) > h5:nth-child(2) > span:nth-child(1) > a:nth-child(1)") %>%
  html_text()

origin <- page %>%
  html_nodes(".wiki-layout-artist-info > article:nth-child(3) > ul:nth-child(3) > li:nth-child(2) > span:nth-child(2)") %>%
  html_text()

style <- page %>%
  html_nodes("li.dictionary-values:nth-child(3) > span:nth-child(2) > a:nth-child(1)") %>%
  html_text()

genre <- page %>%
  html_nodes("li.dictionary-values:nth-child(4) > span:nth-child(2) > a:nth-child(1) > span:nth-child(1)") %>%
  html_text()

media <- page %>%
  html_nodes("li.dictionary-values:nth-child(5) > span:nth-child(2)") %>%
  html_text()

location <- page %>%
  html_nodes(".dictionary-values-gallery > span:nth-child(2)") %>%
  html_text()

img.url <- page %>%
  html_nodes(".wiki-layout-artist-image-wrapper > img:nth-child(3)") %>%
  html_attr("src")

# download.file() is given a single destination, so require exactly one usable
# URL; html_attr() yields NA when the matched node has no `src` attribute.
if (length(img.url) != 1 || is.na(img.url)) {
  stop(
    "Could not resolve a single image URL for ", art[1, 2],
    " (selector matched ", length(img.url), " node(s)).",
    call. = FALSE
  )
}

# mode = "wb" writes the image as binary so it is not corrupted on Windows.
download.file(img.url, "art.jpg", mode = "wb")