Manipulação de XML

Prof. Walmes Marques Zeviani

02 Mar 2017

Objetivo e justificativa

Dados disponíveis em APIs públicas

API WEB

The Open Movie Database

Documentação: http://www.omdbapi.com/.

# Query the OMDb API for information about the movie Titanic and open
# the resulting URL in the browser.
url <- sprintf("http://www.omdbapi.com/?t=%s&r=xml", "titanic")
browseURL(url)

# Search the OMDb API for movies matching the term "batman" and open
# the resulting URL in the browser.
url <- sprintf("http://www.omdbapi.com/?s=%s&r=xml", "batman")
browseURL(url)

Earthquake Catalog API

# Number of events within the given time window.
# The query parameters are joined with "&" and appended to the endpoint.
url <- paste0("https://earthquake.usgs.gov/fdsnws/event/1/count?",
              paste(c("format=xml",
                      "starttime=2017-02-02T12:00:00",
                      "endtime=2017-02-02T13:00:00"),
                    collapse = "&"))
browseURL(url)

# Details of the events within the same time window.
url <- paste0("https://earthquake.usgs.gov/fdsnws/event/1/query?",
              paste(c("format=xml",
                      "starttime=2017-02-02T12:00:00",
                      "endtime=2017-02-02T13:00:00"),
                    collapse = "&"))
browseURL(url)

Open Weather Map API

# JSON record with the information for Curitiba.
# {"_id": 6322752,
#  "name": "Curitiba",
#  "country": "BR",
#  "coord": {"lon": -49.290821, "lat": -25.50395}}

# Fill the city id and the application id into the request template.
# NOTE(review): the appid is hard-coded here -- confirm it is a public
# sample key and not a personal credential.
url <- sprintf(paste0("http://samples.openweathermap.org/data/2.5/",
                      "weather?id=%d&appid=%s"),
               6322752,
               "6eb4d970f22e507866028152781d8dd5")
url

browseURL(url)

Mais APIs

Atenção

Exame e Operação de XML

Dados colhidos da API

# Search for movies matching the term "resident evil".
# The search term is percent-encoded with URLencode() because a raw
# space is not a valid character inside a URL query string.
url <- "http://www.omdbapi.com/?s=%s&r=xml"
# url <- sprintf(url, "rocky")
# url <- sprintf(url, "terminator")
# url <- sprintf(url, "rambo")
url <- sprintf(url, URLencode("resident evil"))
url
## [1] "http://www.omdbapi.com/?s=resident%20evil&r=xml"
# browseURL(url)

O documento XML

# Load the XML package and parse the document located at `url`.
library(XML)
doc <- xmlParse(url)

# Class of the parsed object.
class(doc)
## [1] "XMLInternalDocument" "XMLAbstractDocument"
# Métodos disponíveis para a classe.
methods(class = "XMLInternalDocument")
##  [1] addChildren             coerce                 
##  [3] docName<-               docName                
##  [5] free                    getEncoding            
##  [7] initialize              print                  
##  [9] processXInclude         readKeyValueDB         
## [11] readSolrDoc             saveXML                
## [13] show                    slotsFromS3            
## [15] summary                 xmlChildren            
## [17] xmlClone                [                      
## [19] [[                      xmlNamespaceDefinitions
## [21] xmlRoot                 xmlSourceFunctions     
## [23] xmlSourceSection        xmlSource              
## [25] xmlToDataFrame          xpathApply             
## see '?methods' for accessing help and source code
# Nome do arquivo origem.
docName(doc)
## [1] "http://www.omdbapi.com/?s=resident%20evil&r=xml"
# Mostra o conteúdo.
show(doc)
## <?xml version="1.0" encoding="utf-8"?>
## <root totalResults="67" response="True">
##   <result title="Resident Evil" year="2002" imdbID="tt0120804" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BN2Y2MTljNjMtMDRlNi00ZWNhLThmMWItYTlmZjYyZDk4NzYxXkEyXkFqcGdeQXVyNjQ2MjQ5NzM@._V1_SX300.jpg"/>
##   <result title="Resident Evil: Apocalypse" year="2004" imdbID="tt0318627" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BMTc1NTUxMzk0Nl5BMl5BanBnXkFtZTcwNDQ1MDIzMw@@._V1_SX300.jpg"/>
##   <result title="Resident Evil: Extinction" year="2007" imdbID="tt0432021" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BNDQ0MWI3MmEtMzM0OC00Y2ViLWE4MDItMzNhNmY1ZTdjMWE2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg"/>
##   <result title="Resident Evil: Afterlife" year="2010" imdbID="tt1220634" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BMTkxNzM3OTg5OF5BMl5BanBnXkFtZTcwMDA5MDA2Mw@@._V1_SX300.jpg"/>
##   <result title="Resident Evil: Retribution" year="2012" imdbID="tt1855325" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BMTA2NTkwNjUxNTZeQTJeQWpwZ15BbWU3MDE2OTMxMTg@._V1_SX300.jpg"/>
##   <result title="Resident Evil: The Final Chapter" year="2016" imdbID="tt2592614" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BMTc0Mzc2OTQ0Ml5BMl5BanBnXkFtZTgwOTQ5MjE4MDI@._V1_SX300.jpg"/>
##   <result title="Resident Evil: Degeneration" year="2008" imdbID="tt1174954" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BMTgwMDE4NzcxMl5BMl5BanBnXkFtZTcwNjU1NjcwMg@@._V1_SX300.jpg"/>
##   <result title="Resident Evil: Damnation" year="2012" imdbID="tt1753496" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BNzg2ODUxOTIzN15BMl5BanBnXkFtZTcwMDM0NzAzOA@@._V1_SX300.jpg"/>
##   <result title="Resident Evil 4" year="2005" imdbID="tt0397042" type="game" poster="http://ia.media-imdb.com/images/M/MV5BMTU1NjY2MDM4Nl5BMl5BanBnXkFtZTgwMzk0MTU3MDE@._V1_SX300.jpg"/>
##   <result title="Resident Evil 5" year="2009" imdbID="tt0473298" type="game" poster="http://ia.media-imdb.com/images/M/MV5BMTk3MDMzNzMxMV5BMl5BanBnXkFtZTcwNDYxNTMxNg@@._V1_SX300.jpg"/>
## </root>
## 

Propriedades do elemento root

# Take the root element of the parsed document.
root <- xmlRoot(doc)

# Class of the object.
class(root)
## [1] "XMLInternalElementNode" "XMLInternalNode"       
## [3] "XMLAbstractNode"
# Métodos disponíveis para a classe.
methods(class = "XMLInternalElementNode")
##  [1] addAttributes       coerce              docName            
##  [4] getEncoding         initialize          processXInclude    
##  [7] readHTMLList        readHTMLTable       readKeyValueDB     
## [10] readSolrDoc         removeAttributes    removeXMLNamespaces
## [13] saveXML             show                slotsFromS3        
## [16] xmlAttrs<-          xmlChildren<-       xmlClone           
## [19] [[                  xmlNamespaces<-     xmlNamespace<-     
## [22] xmlName<-           xmlParent           xmlToDataFrame     
## [25] xmlToS4             xmlValue<-         
## see '?methods' for accessing help and source code
# Mostra o conteúdo.
show(root)
## <root totalResults="67" response="True">
##   <result title="Resident Evil" year="2002" imdbID="tt0120804" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BN2Y2MTljNjMtMDRlNi00ZWNhLThmMWItYTlmZjYyZDk4NzYxXkEyXkFqcGdeQXVyNjQ2MjQ5NzM@._V1_SX300.jpg"/>
##   <result title="Resident Evil: Apocalypse" year="2004" imdbID="tt0318627" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BMTc1NTUxMzk0Nl5BMl5BanBnXkFtZTcwNDQ1MDIzMw@@._V1_SX300.jpg"/>
##   <result title="Resident Evil: Extinction" year="2007" imdbID="tt0432021" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BNDQ0MWI3MmEtMzM0OC00Y2ViLWE4MDItMzNhNmY1ZTdjMWE2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg"/>
##   <result title="Resident Evil: Afterlife" year="2010" imdbID="tt1220634" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BMTkxNzM3OTg5OF5BMl5BanBnXkFtZTcwMDA5MDA2Mw@@._V1_SX300.jpg"/>
##   <result title="Resident Evil: Retribution" year="2012" imdbID="tt1855325" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BMTA2NTkwNjUxNTZeQTJeQWpwZ15BbWU3MDE2OTMxMTg@._V1_SX300.jpg"/>
##   <result title="Resident Evil: The Final Chapter" year="2016" imdbID="tt2592614" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BMTc0Mzc2OTQ0Ml5BMl5BanBnXkFtZTgwOTQ5MjE4MDI@._V1_SX300.jpg"/>
##   <result title="Resident Evil: Degeneration" year="2008" imdbID="tt1174954" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BMTgwMDE4NzcxMl5BMl5BanBnXkFtZTcwNjU1NjcwMg@@._V1_SX300.jpg"/>
##   <result title="Resident Evil: Damnation" year="2012" imdbID="tt1753496" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BNzg2ODUxOTIzN15BMl5BanBnXkFtZTcwMDM0NzAzOA@@._V1_SX300.jpg"/>
##   <result title="Resident Evil 4" year="2005" imdbID="tt0397042" type="game" poster="http://ia.media-imdb.com/images/M/MV5BMTU1NjY2MDM4Nl5BMl5BanBnXkFtZTgwMzk0MTU3MDE@._V1_SX300.jpg"/>
##   <result title="Resident Evil 5" year="2009" imdbID="tt0473298" type="game" poster="http://ia.media-imdb.com/images/M/MV5BMTk3MDMzNzMxMV5BMl5BanBnXkFtZTcwNDYxNTMxNg@@._V1_SX300.jpg"/>
## </root>
# Nome do elemento raiz.
xmlName(root)
## [1] "root"
# Quantos filhos (child) possui.
xmlSize(root)
## [1] 10
# Atributos do elemento raiz.
xmlAttrs(root)
## totalResults     response 
##         "67"       "True"

Seleção

# Parse the XML served at the Kiva URL below and take its root element.
kiva <- "http://api.kivaws.org/v1/teams/2/lenders.xml"
doc <- xmlParse(kiva)
root <- xmlRoot(doc)

# Number of children of the root element.
xmlSize(root)
## [1] 2
xmlName(root)
## [1] "response"
# Seleciona o primeiro filho. Qual a diferença?
root[1]
## $paging
## <paging>
##   <page>1</page>
##   <total>68</total>
##   <page_size>50</page_size>
##   <pages>2</pages>
## </paging> 
## 
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"
root[[1]]
## <paging>
##   <page>1</page>
##   <total>68</total>
##   <page_size>50</page_size>
##   <pages>2</pages>
## </paging>
# Extraindo o conteúdo do elemento.
xmlValue(root[[1]])
## [1] "168502"
root[[1]]["total"]
## $total
## <total>68</total> 
## 
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"
root[[1]][["total"]]
## <total>68</total>
xmlValue(root[[1]][["total"]])
## [1] "68"

Atributos

# Build the OMDb search URL for the term "resident evil".
# The search term is percent-encoded with URLencode() because a raw
# space is not a valid character inside a URL query string.
url <- "http://www.omdbapi.com/?s=%s&r=xml"
url <- sprintf(url, URLencode("resident evil"))
url
## [1] "http://www.omdbapi.com/?s=resident%20evil&r=xml"
# browseURL(url)

doc <- xmlParse(url)
docName(doc)
## [1] "http://www.omdbapi.com/?s=resident%20evil&r=xml"
root <- xmlRoot(doc)
xmlName(root)
## [1] "root"
xmlSize(root)
## [1] 10
# Pega os atributos do elemento raiz.
xmlAttrs(root)
## totalResults     response 
##         "67"       "True"
# Nome dos filhos.
names(root)
##   result   result   result   result   result   result   result 
## "result" "result" "result" "result" "result" "result" "result" 
##   result   result   result 
## "result" "result" "result"
# Seleciona o primeiro filho.
root[[1]]
## <result title="Resident Evil" year="2002" imdbID="tt0120804" type="movie" poster="https://images-na.ssl-images-amazon.com/images/M/MV5BN2Y2MTljNjMtMDRlNi00ZWNhLThmMWItYTlmZjYyZDk4NzYxXkEyXkFqcGdeQXVyNjQ2MjQ5NzM@._V1_SX300.jpg"/>
# Extrai todos os atributos.
xmlAttrs(root[[1]])
##                                                                                                                                             title 
##                                                                                                                                   "Resident Evil" 
##                                                                                                                                              year 
##                                                                                                                                            "2002" 
##                                                                                                                                            imdbID 
##                                                                                                                                       "tt0120804" 
##                                                                                                                                              type 
##                                                                                                                                           "movie" 
##                                                                                                                                            poster 
## "https://images-na.ssl-images-amazon.com/images/M/MV5BN2Y2MTljNjMtMDRlNi00ZWNhLThmMWItYTlmZjYyZDk4NzYxXkEyXkFqcGdeQXVyNjQ2MjQ5NzM@._V1_SX300.jpg"
# Extrai apenas o atributo especificado.
xmlGetAttr(root[[1]], name = "title")
## [1] "Resident Evil"
xmlGetAttr(root[[1]], name = "year")
## [1] "2002"

Percorrer a árvore

# Build a list in which each child (movie) of the root is one element.
child <- xmlChildren(root)
str(child)
## List of 10
##  $ result:Classes 'XMLInternalElementNode', 'XMLInternalNode', 'XMLAbstractNode' <externalptr> 
##  $ result:Classes 'XMLInternalElementNode', 'XMLInternalNode', 'XMLAbstractNode' <externalptr> 
##  $ result:Classes 'XMLInternalElementNode', 'XMLInternalNode', 'XMLAbstractNode' <externalptr> 
##  $ result:Classes 'XMLInternalElementNode', 'XMLInternalNode', 'XMLAbstractNode' <externalptr> 
##  $ result:Classes 'XMLInternalElementNode', 'XMLInternalNode', 'XMLAbstractNode' <externalptr> 
##  $ result:Classes 'XMLInternalElementNode', 'XMLInternalNode', 'XMLAbstractNode' <externalptr> 
##  $ result:Classes 'XMLInternalElementNode', 'XMLInternalNode', 'XMLAbstractNode' <externalptr> 
##  $ result:Classes 'XMLInternalElementNode', 'XMLInternalNode', 'XMLAbstractNode' <externalptr> 
##  $ result:Classes 'XMLInternalElementNode', 'XMLInternalNode', 'XMLAbstractNode' <externalptr> 
##  $ result:Classes 'XMLInternalElementNode', 'XMLInternalNode', 'XMLAbstractNode' <externalptr> 
##  - attr(*, "class")= chr [1:2] "XMLInternalNodeList" "XMLNodeList"
# Collect the "title" and "year" attributes of a node into a named
# character vector (names: title, year).
f <- function(node) {
    title <- xmlGetAttr(node, name = "title")
    year <- xmlGetAttr(node, name = "year")
    c(title = title, year = year)
}

# Aplica uma função em cada elemento da lista.
sapply(child, FUN = f)
##       result          result                     
## title "Resident Evil" "Resident Evil: Apocalypse"
## year  "2002"          "2004"                     
##       result                      result                    
## title "Resident Evil: Extinction" "Resident Evil: Afterlife"
## year  "2007"                      "2010"                    
##       result                      
## title "Resident Evil: Retribution"
## year  "2012"                      
##       result                            
## title "Resident Evil: The Final Chapter"
## year  "2016"                            
##       result                        result                    
## title "Resident Evil: Degeneration" "Resident Evil: Damnation"
## year  "2008"                        "2012"                    
##       result            result           
## title "Resident Evil 4" "Resident Evil 5"
## year  "2005"            "2009"
# xmlApply(x, ...) = lapply(xmlChildren(x), ...)
xmlApply(root, FUN = f)
## $result
##           title            year 
## "Resident Evil"          "2002" 
## 
## $result
##                       title                        year 
## "Resident Evil: Apocalypse"                      "2004" 
## 
## $result
##                       title                        year 
## "Resident Evil: Extinction"                      "2007" 
## 
## $result
##                      title                       year 
## "Resident Evil: Afterlife"                     "2010" 
## 
## $result
##                        title                         year 
## "Resident Evil: Retribution"                       "2012" 
## 
## $result
##                              title 
## "Resident Evil: The Final Chapter" 
##                               year 
##                             "2016" 
## 
## $result
##                         title                          year 
## "Resident Evil: Degeneration"                        "2008" 
## 
## $result
##                      title                       year 
## "Resident Evil: Damnation"                     "2012" 
## 
## $result
##             title              year 
## "Resident Evil 4"            "2005" 
## 
## $result
##             title              year 
## "Resident Evil 5"            "2009"
# xmlSApply(x, ...) = sapply(xmlChildren(x), ...)
# S: simplifica quando possível.
xmlSApply(root, FUN = f)
##       result          result                     
## title "Resident Evil" "Resident Evil: Apocalypse"
## year  "2002"          "2004"                     
##       result                      result                    
## title "Resident Evil: Extinction" "Resident Evil: Afterlife"
## year  "2007"                      "2010"                    
##       result                      
## title "Resident Evil: Retribution"
## year  "2012"                      
##       result                            
## title "Resident Evil: The Final Chapter"
## year  "2016"                            
##       result                        result                    
## title "Resident Evil: Degeneration" "Resident Evil: Damnation"
## year  "2008"                        "2012"                    
##       result            result           
## title "Resident Evil 4" "Resident Evil 5"
## year  "2005"            "2009"
# URL of the plant catalog XML file.
placat <- "https://www.w3schools.com/Xml/plant_catalog.xml"

# Fetch the content with RCurl's getURL() and parse the result, then
# take the root element of the parsed document.
library(RCurl)
doc <- xmlParse(getURL(placat))
root <- xmlRoot(doc)
# Number of children of the root element.
xmlSize(root)
## [1] 36
xmlName(root)
## [1] "CATALOG"
names(root)
##   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT 
## "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" 
##   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT 
## "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" 
##   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT 
## "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" 
##   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT 
## "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" "PLANT" 
##   PLANT   PLANT   PLANT   PLANT 
## "PLANT" "PLANT" "PLANT" "PLANT"
root[[1]]
## <PLANT>
##   <COMMON>Bloodroot</COMMON>
##   <BOTANICAL>Sanguinaria canadensis</BOTANICAL>
##   <ZONE>4</ZONE>
##   <LIGHT>Mostly Shady</LIGHT>
##   <PRICE>$2.44</PRICE>
##   <AVAILABILITY>031599</AVAILABILITY>
## </PLANT>
names(root[[1]])
##         COMMON      BOTANICAL           ZONE          LIGHT 
##       "COMMON"    "BOTANICAL"         "ZONE"        "LIGHT" 
##          PRICE   AVAILABILITY 
##        "PRICE" "AVAILABILITY"
root[[1]][["BOTANICAL"]]
## <BOTANICAL>Sanguinaria canadensis</BOTANICAL>
# xmlSApply(root,
#           FUN = function(node) {
#               node[["PRICE"]]
#           })

# Same selection as the commented-out code above, passing "[[" as the
# function and "PRICE" as its extra argument; head() limits the output.
head(xmlSApply(root, FUN = "[[", "PRICE"))
## $PLANT
## <PRICE>$2.44</PRICE> 
## 
## $PLANT
## <PRICE>$9.37</PRICE> 
## 
## $PLANT
## <PRICE>$6.81</PRICE> 
## 
## $PLANT
## <PRICE>$9.90</PRICE> 
## 
## $PLANT
## <PRICE>$6.44</PRICE> 
## 
## $PLANT
## <PRICE>$9.03</PRICE>
# Extract the text content of the PRICE element of every child of root.
xmlSApply(root, FUN = function(plant) xmlValue(plant[["PRICE"]]))
##   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT 
## "$2.44" "$9.37" "$6.81" "$9.90" "$6.44" "$9.03" "$4.45" "$3.99" 
##   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT 
## "$3.23" "$2.98" "$2.80" "$5.59" "$6.59" "$3.90" "$3.20" "$9.04" 
##   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT 
## "$6.94" "$9.58" "$8.86" "$9.16" "$4.59" "$7.16" "$9.80" "$2.57" 
##   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT   PLANT 
## "$9.34" "$2.78" "$7.06" "$6.56" "$7.81" "$8.56" "$9.26" "$4.36" 
##   PLANT   PLANT   PLANT   PLANT 
## "$7.89" "$8.60" "$5.63" "$3.02"

Resumo das funções

Função Classe
xmlParse XMLInternalDocument
xmlRoot XMLInternalElementNode
[ XMLInternalNodeList
[[ XMLInternalElementNode


Função Descrição
xmlName Nome do elemento
xmlSize Número de filhos
xmlAttrs Vetor com os atributos
xmlGetAttr Apenas um atributo pelo nome
xmlValue Conteúdo do elemento
xmlChildren Lista com os filhos
xmlParent Elemento pai


Função Descrição
xmlApply Aplica função aos filhos de um elemento
xmlSApply Idem, mas simplifica quando possível

Resumo

Próxima semana

Referências