背景

pubchunks 的目的是从 XML 格式的学术文章中提取部分数据。我们不需要了解 XML 及其各种格式细节,只需知道文件或 XML 字符串在哪里,以及想要每篇文章的哪些部分。之后用户可以组合这些部分,并在下游做任何想做的处理;例如,分析文本结构。

pubchunks中的函数

pubchunks 提供以下函数,其中前两个(pub_chunks() 和 pub_tabularize())是主要函数:

  1. pub_chunks():获取XML部分
  2. pub_tabularize():将pub_chunks()的输出转换为data.frame
  3. pub_guess_publisher():从XML文件或字符串猜测发布者
  4. pub_sections():列出pubchunks知道如何处理的部分(sections)
  5. pub_providers():提供者(即发布者)pubchunks知道如何明确处理

支持的出版商

  1. elife
  2. plos
  3. elsevier
  4. hindawi
  5. pensoft
  6. peerj
  7. copernicus
  8. frontiers
  9. f1000research

支持提取的部分有:

  1. Front - 发布者,期刊和文章元数据元素
  2. Body - 文章的正文
  3. Back - 文章的后置部分(back matter):致谢,作者贡献,参考文献
  4. Title - 文章标题
  5. Doi - 文章doi
  6. Categories - 发布商的类别,如果有的话
  7. Author - 作者
  8. Aff - 隶属关系(包括作者姓名)
  9. Keyword - 关键字
  10. Abstract - 文章摘要
  11. Executive_summary - 文章执行摘要
  12. Refs - 参考文献
  13. Refs_dois - 参考文献的doi - 如果有的话
  14. Publisher - 发布者名称
  15. Journal_meta - 期刊元数据
  16. Article_meta - 文章元数据
  17. Acknowledgments - 致谢
  18. Permissions - 文章权限
  19. History - 文章历史日期:收稿,接受,出版等

安装

#install.packages("pubchunks")
#
#remotes::install_github("ropensci/pubchunks")
#
#remotes::install_github("ropensci/pubchunks@fix-pub_chunks")

library(pubchunks)

Import

  1. XML Path(s)
# Single path: the Pensoft example XML file shipped with the pubchunks package
path <- system.file("examples/pensoft_1.xml", package = "pubchunks")
# Multiple paths: one bundled example XML file per publisher
pensoft_xml <- system.file("examples/pensoft_1.xml", package = "pubchunks")
peerj_xml <- system.file("examples/peerj_1.xml", package = "pubchunks")
copernicus_xml <- system.file("examples/copernicus_1.xml", package = "pubchunks")
frontiers_xml <- system.file("examples/frontiers_1.xml", package = "pubchunks")
paths <- list(pensoft_xml, peerj_xml, copernicus_xml, frontiers_xml)
# Passing a list of paths returns one <pub chunks> result per input file
pub_chunks(
  paths,
  sections = c("abstract", "title", "authors", "refs")
)
## [[1]]
## <pub chunks>
##   from: character
##   sections: abstract, title, authors, refs
##    abstract (n=1): AbstractNineteen species of seed-beetles belonging ...
##    title (n=1): Contribution to the knowledge of seed-beetles (Col ...
##    authors (n=7): nested list
##    refs (n=13): AntonKW (2010) Catalogue of Palaearctic Coleoptera
## 
## [[2]]
## <pub chunks>
##   from: character
##   sections: abstract, title, authors, refs
##    abstract (n=1): Climate change is predicted to lead to more extrem ...
##    title (n=1): Storm effects on intertidal invertebrates: increas ...
##    authors (n=7): nested list
##    refs (n=60): Alcántara-Carrió et al. (2017)Alcántara-CarrióJSas
## 
## [[3]]
## <pub chunks>
##   from: character
##   sections: abstract, title, authors, refs
##    abstract (n=1): Soil temperatures at various depths are unique par ...
##    title (n=1): Quality control of 10-min soil temperatures data a ...
##    authors (n=3): nested list
##    refs (n=9): 1Bertrand, C., Gonzalez Sotelino, L., and Journée,
## 
## [[4]]
## <pub chunks>
##   from: character
##   sections: abstract, title, authors, refs
##    abstract (n=1): Our current understanding of Antarctic soils is de ...
##    title (n=1): Metagenomic Analysis of a Southern Maritime Antarc ...
##    authors (n=8): nested list
##    refs (n=56): AislabieJ.BroadyP.SaulD. (2006). Culturable hetero
## 
## attr(,"ft_data")
## [1] FALSE
x <- path
pub_chunks(x, sections = "abstract")
## <pub chunks>
##   from: character
##   sections: abstract
##    abstract (n=1): AbstractNineteen species of seed-beetles belonging ...
pub_chunks(x, sections = "aff")
## <pub chunks>
##   from: character
##   sections: aff
##    aff (n=7): nested list
pub_chunks(x, sections = c("abstract", "title", "authors", "refs"))
## <pub chunks>
##   from: character
##   sections: abstract, title, authors, refs
##    abstract (n=1): AbstractNineteen species of seed-beetles belonging ...
##    title (n=1): Contribution to the knowledge of seed-beetles (Col ...
##    authors (n=7): nested list
##    refs (n=13): AntonKW (2010) Catalogue of Palaearctic Coleoptera
  2. Strings
# Read the XML file and join its lines with newlines into a single string
xml_str <- paste0(readLines(path), collapse = "\n")

# pub_chunks() is given the XML content itself here, not a file path
pub_chunks(xml_str, sections = "title")
## <pub chunks>
##   from: character
##   sections: title
##    title (n=1): Contribution to the knowledge of seed-beetles (Col ...
  3. xml_document
xml_doc <- xml2::read_xml(path)
class(xml_doc)
## [1] "xml_document" "xml_node"
pub_chunks(xml_doc, sections = "title")
## <pub chunks>
##   from: xml_document
##   sections: title
##    title (n=1): Contribution to the knowledge of seed-beetles (Col ...
  4. Doi
# NOTE(review): `%>%` is used below without loading magrittr; presumably it is
# made available by library("fulltext") — confirm
library("fulltext")
dois <- c('10.1371/journal.pone.0086169', '10.1371/journal.pone.0155491', 
  '10.7554/eLife.03032')
# Fetch the articles for these DOIs and collect the result into `x`
# (presumably requires network access — confirm)
x <- fulltext::ft_get(dois) %>% fulltext::ft_collect()

# The result is grouped by publisher, then by DOI (see the printed output)
pub_chunks(x, sections="authors")
## $plos
## $plos$`10.1371/journal.pone.0086169`
## <pub chunks>
##   from: xml_document
##   sections: authors
##    authors (n=4): nested list
## 
## $plos$`10.1371/journal.pone.0155491`
## <pub chunks>
##   from: xml_document
##   sections: authors
##    authors (n=9): nested list
## 
## 
## $elife
## $elife$`10.7554/eLife.03032`
## <pub chunks>
##   from: xml_document
##   sections: authors
##    authors (n=6): nested list
## 
## 
## attr(,"ft_data")
## [1] TRUE

Output

  1. List: default

  2. Dataframe: 输出表格

# NOTE(review): `path` is reassigned here but is not used by the calls below;
# `x` still holds the fulltext collection from the DOI example, which is why
# the output is grouped by publisher — confirm whether `path` was intended.
path <- system.file("examples/elife_1.xml", package = "pubchunks")
res <- pub_chunks(x, c("doi", "title", "keywords"))
# Convert each <pub chunks> result into a data.frame
pub_tabularize(res)
## $plos
## $plos$`10.1371/journal.pone.0086169`
##                            doi
## 1 10.1371/journal.pone.0086169
##                                                                                                               title
## 1 Holsteins Favor Heifers, Not Bulls: Biased Milk Production Programmed during Pregnancy as a Function of Fetal Sex
##   .publisher
## 1       plos
## 
## $plos$`10.1371/journal.pone.0155491`
##                            doi
## 1 10.1371/journal.pone.0155491
##                                                                                                            title
## 1 Uterine Expression of NDRG4 Is Induced by Estrogen and Up-Regulated during Embryo Implantation Process in Mice
##   .publisher
## 1       plos
## 
## 
## $elife
## $elife$`10.7554/eLife.03032`
##                   doi                                          title
## 1 10.7554/eLife.03032 MicroRNA-mediated repression of nonsense mRNAs
## 2 10.7554/eLife.03032 MicroRNA-mediated repression of nonsense mRNAs
## 3 10.7554/eLife.03032 MicroRNA-mediated repression of nonsense mRNAs
## 4 10.7554/eLife.03032 MicroRNA-mediated repression of nonsense mRNAs
## 5 10.7554/eLife.03032 MicroRNA-mediated repression of nonsense mRNAs
## 6 10.7554/eLife.03032 MicroRNA-mediated repression of nonsense mRNAs
##                       keywords .publisher
## 1                     microRNA      elife
## 2            nonsense mutation      elife
## 3 nonsense-mediated mRNA decay      elife
## 4                          APC      elife
## 5             intron retention      elife
## 6  premature termination codon      elife
# Multiple file paths: pub_tabularize() yields one data.frame per input file.
# In the printed output, a column is absent when that section was not
# returned for a file (e.g. no `doi` column for the first two files).
paths <- list(pensoft_xml, peerj_xml, copernicus_xml, frontiers_xml)
out <- pub_chunks(
  paths,
  sections = c("doi", "title", "keywords")
)
pub_tabularize(out)
## [[1]]
##                                                                                                     title
## 1 Contribution to the knowledge of seed-beetles (Coleoptera, Chrysomelidae, Bruchinae) in Xinjiang, China
##   .publisher
## 1    pensoft
## 
## [[2]]
##                                                                                                title
## 1 Storm effects on intertidal invertebrates: increased beta diversity of few individuals and species
##   .publisher
## 1      peerj
## 
## [[3]]
##                      doi
## 1 10.5194/asr-12-23-2015
##                                                     title .publisher
## 1 Quality control of 10-min soil temperatures data at RMI copernicus
## 
## [[4]]
##                        doi
## 1 10.3389/fmicb.2012.00403
##                                                        title .publisher
## 1 Metagenomic Analysis of a Southern Maritime Antarctic Soil  frontiers
# Row-bind the list of data.frames into a single table; fill = TRUE pads
# columns that are missing from some elements with NA
data.table::rbindlist(pub_tabularize(out), fill = TRUE)
##                                                                                                      title
## 1: Contribution to the knowledge of seed-beetles (Coleoptera, Chrysomelidae, Bruchinae) in Xinjiang, China
## 2:      Storm effects on intertidal invertebrates: increased beta diversity of few individuals and species
## 3:                                                 Quality control of 10-min soil temperatures data at RMI
## 4:                                              Metagenomic Analysis of a Southern Maritime Antarctic Soil
##    .publisher                      doi
## 1:    pensoft                     <NA>
## 2:      peerj                     <NA>
## 3: copernicus   10.5194/asr-12-23-2015
## 4:  frontiers 10.3389/fmicb.2012.00403