1

I have been trying to scrape a website for tables with rvest. Currently I can get to a table, but it only pulls the headers of the table and not the content. I haven't figured out how to completely resolve this, though I've tried a few similar solutions. (How to get table using rvest())

I have already figured out how to submit a form through a specific button when the buttons lack appropriate unique identifiers. This was done using: How to submit login form in Rvest package w/o button argument, https://github.com/hadley/rvest/issues/156

Here's my operating script:

library(rvest)
library(httr)
library(R.utils)
    # Address of the zip code query page; it is opened first and revisited
    # after logging in.
    url <- "https://itmdapps.milwaukee.gov/publicApplication_QD/zipcode.jsp"
##############################
#fix for package available online. https://github.com/hadley/rvest/issues/156
#' Build the request description used to submit a form
#'
#' Drop-in replacement for rvest's internal `submit_request()` (see
#' https://github.com/hadley/rvest/issues/156) that also accepts a named
#' list as `submit`, so a form can be submitted through a button that has
#' no usable name of its own.
#'
#' @param form A form object: a list with `fields`, `method`, `url` and
#'   `enctype` elements. Each field is a list that may carry `type`,
#'   `name` and `value`.
#' @param submit Which submit field to send:
#'   * `NULL` (default): the first submit field, announced with a message;
#'   * a single string: the submit field whose `name` equals it;
#'   * a list: its first element is sent as the value and the name of that
#'     element as the field name, using the first submit field (if any) as
#'     a template.
#' @return A list with elements `method`, `encode`, `url` and `values`
#'   (a named list of the field values to send).
custom.submit_request <- function(form, submit = NULL) {
  is_submit <- function(x) identical(tolower(x$type), "submit")
  submits <- Filter(is_submit, form$fields)
  nsubmits <- Filter(Negate(is_submit), form$fields)

  if (is.list(submit)) {
    # Take name and value from the list; fall back to an empty template
    # when the form has no submit field at all.
    target <- if (length(submits) > 0) submits[[1]] else list()
    target$name <- names(submit)[1]
    target$value <- submit[[1]]
    submit <- target
  } else if (is.character(submit)) {
    # Look the button up by name. The predicate must yield exactly one
    # logical per field: a bare `x$name == submit` is `logical(0)` for a
    # field without a name, which shifts the match onto the wrong field.
    name_matches <- vapply(
      submits,
      function(x) isTRUE(as.vector(x$name == submit)),
      logical(1)
    )
    if (!any(name_matches)) {
      available <- vapply(
        submits,
        function(x) if (length(x$name) == 1) as.character(x$name) else "<unnamed>",
        character(1)
      )
      stop(
        "Unknown submission name '", paste(submit, collapse = ", "), "'.\n",
        "Possible values: ", paste(available, collapse = ", "),
        call. = FALSE
      )
    }
    submit <- submits[[which(name_matches)[1]]]
  } else if (is.null(submit)) {
    # No choice given: use the first submit field.
    if (length(submits) == 0) {
      stop("Could not find possible submission target.", call. = FALSE)
    }
    submit <- submits[[1]]
    message("Submitting with '", submit$name, "'")
  } else {
    stop(
      "`submit` must be NULL, a single name, or a named list.",
      call. = FALSE
    )
  }

  # Only POST and GET are supported; anything else falls back to GET.
  method <- form$method
  if (!(method %in% c("POST", "GET"))) {
    warning("Invalid method (", method, "), defaulting to GET",
            call. = FALSE)
    method <- "GET"
  }

  # Send every non-submit field plus the one chosen submit field, dropping
  # fields that carry no value.
  fields <- nsubmits
  fields[submit$name] <- list(submit)
  fields <- Filter(function(x) length(x$value) > 0, fields)
  values <- lapply(fields, function(x) x[["value"]])
  names(values) <- names(fields)

  list(
    method = method,
    encode = form$enctype,
    url = form$url,
    values = values
  )
}
# Swap rvest's internal submit_request() for the custom version defined in
# this script, so that submit_form() accepts a named list as `submit`.
R.utils::reassignInPackage(
  name = "submit_request",
  pkgName = "rvest",
  value = custom.submit_request
)
#####################################

# Value passed to the `zipcode` field of the search form.
# NOTE(review): presumably the position of the wanted entry in the zip code
# list rather than a zip code itself; confirm against the page.
target_zip_code_position <- 2

# Open the site and collect the forms on the landing page.
webpage.session <- html_session(url)
form <- html_form(webpage.session)

# ---- Log in ----
form  # inspect the forms; the second one is used for logging in
filled_form <- form  # work on a copy so the original stays untouched
filled_form[[2]] <- set_values(
  filled_form[[2]],
  username = "address",
  password = "user"
)
filled_form  # inspect the filled-in copy
#filled_form[[2]]$url<-""  #URL needs to be cleared to prevent error message when submitting.

# With no `submit` given, the first submit button is used (with a message).
logged_in.session <- submit_form(
  session = webpage.session,
  form = filled_form[[2]]
)

# ---- Run the zip code query ----
# Go back to the query page within the logged-in session.
zip_search.session <- jump_to(logged_in.session, url)
zip_search.form <- html_form(zip_search.session)
zip_search.form_filled <- zip_search.form
zip_search.form_filled[[2]] <- set_values(
  zip_search.form_filled[[2]],
  zipcode = target_zip_code_position,
  format = 1,
  startDate = "01/01/2005",
  endDate = "01/01/2006"
)

# The submit button is selected by passing name and value as a named list
# (a rather bizarre approach, but it works).
list_submit <- list(submit = "WIBR Detailed")
output.session <- submit_form(
  session = zip_search.session,
  form = zip_search.form_filled[[2]],
  submit = list_submit
)

# ---- Extract the result table ----
# Earlier attempts, which also returned no rows:
#output.read_html<-read_html(output.session)
#output_table<-html_table(output.session, fill = TRUE)[[1]]  #no rows.

table_node <- html_node(
  output.session,
  "div.main div.content:nth-child(5) table.bordered:nth-child(1)"
)
html_text(table_node)  # the selector hits the right table, but still no rows
html_table(table_node)
RegressForward
  • 249
  • 1
  • 13

0 Answers0