Module: Extcite

Defined in:: lib/extcite.rb,
lib/extcite/version.rb

Constant Summary collapse

VERSION =

"0.4.1"

Class Method Summary collapse

.cont_neg(ids:) ⇒ Object

Get citation(s) using Crossref content negotiation.
.extract(path:, file: "out.bib", output: "bib") ⇒ Object

Extract DOIs from one or more PDFs.
.extract_dois(path:) ⇒ Object

Extract DOIs from one or more PDFs after extracting text.
.extract_from_metadata(path:, try_regex: true) ⇒ Object

Try to extract DOIs from one or more PDF metadata sections.
.extract_text(path:) ⇒ Object

Extract text from a pdf, or many pdfs.
.get_ids(txt:) ⇒ Object

Get DOIs from a String or Array of Strings.

Class Method Details

.cont_neg(ids:) ⇒ `Object`

Get citation(s) using Crossref content negotiation

Return: a string of bib data

Examples:

require 'extcite'
Extcite.cont_neg(ids: "10.1016/j.dendro.2014.01.004")

Parameters:

ids (Array[String]) —

One or more DOIs in an array

# File 'lib/extcite.rb', line 305

# Look up citation data for the given DOI(s) by handing them straight to
# Serrano.content_negotiation.
#
# @param ids [String, Array<String>] one or more DOIs
# @return [Object] whatever Serrano.content_negotiation returns for +ids+
def self.cont_neg(ids:)
  Serrano.content_negotiation(ids: ids)
end

.extract(path:, file: "out.bib", output: "bib") ⇒ `Object`

Extract DOIs from one or more PDFs

Return: writes BibTeX entries to the given file; if file is nil, returns the citation for the first PDF that yields one instead of writing

When writing to a file, `extract` by default appends to the end
of the file so you can build up your bibtex file with your
citations

Examples:

require 'extcite'
require 'faraday'
# get a paper in pdf format

path = '2068.pdf'
res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get;
f = File.new(path, "wb");
f.write(res.body);
f.close()
# extract doi from the pdf

Extcite.extract(path: path)
Extcite.extract(path: path, file: nil)

Parameters:

path (String) —

Path to a pdf file, or a folder of PDF files
file (String) (defaults to: "out.bib") —

File name to write data to, or nil to return the citation instead of writing it to a file
output (String) (defaults to: "bib") —

Type of output. Only BibTeX for now

# File 'lib/extcite.rb', line 38

# Find an identifier (DOI or arXiv id) for each PDF, fetch its citation and
# either write it to +file+ or return it.
#
# For every path the identifier is looked up in the PDF's XMP metadata
# first (dc:identifier attribute, then the prism:doi, pdf:WPS-ARTICLEDOI and
# pdfx:WPS-ARTICLEDOI nodes). If the metadata is missing, cannot be parsed,
# or holds no identifier, the extracted text is searched instead.
#
# @param path [String] passed through make_paths to obtain the list of PDFs
# @param file [String, nil] file handed to write_bib; when nil the citation
#   of the first PDF that yields one is returned and processing stops
# @param output [String] accepted for interface compatibility; not read here
# @return [Object] the citation when +file+ is nil and one was found,
#   otherwise the list of paths that was iterated
def self.extract(path:, file: "out.bib", output: "bib")
  path = make_paths(path)
  path.each do |x|
    ids = nil
    pdfmeta = PDF::Reader.new(x).metadata

    unless pdfmeta.nil?
      begin
        # Parsing happens inside the rescue so that malformed XMP falls
        # through to the text search instead of aborting the whole run.
        xml = Oga.parse_xml(pdfmeta)

        # try dc:identifier attribute
        identifier = xml.xpath('//rdf:Description').attr('dc:identifier')[0]
        if identifier
          ids = identifier.text.sub(/doi:/, '')
        else
          # then each DOI-bearing node in turn; a query only counts when it
          # matches exactly one node
          [
            '//rdf:Description//prism:doi',
            '//rdf:Description//pdf:WPS-ARTICLEDOI',
            '//rdf:Description//pdfx:WPS-ARTICLEDOI'
          ].each do |query|
            nodes = xml.xpath(query)
            next unless nodes.length == 1

            ids = nodes.text
            break
          end
        end
      rescue StandardError
        ids = nil
      end
    end

    # if not found, try regexing for DOI
    ids = Extcite.get_ids(txt: Extcite.extract_text_one(x)) if ids.nil?

    # get_ids may hand back nil as well as an empty string
    if ids.nil? || ids.length == 0
      puts "no DOI found in " + x
      next
    end

    if ids.match(/arxiv/i) && ids.length < 200
      arxiv_id = ids.gsub(/arxiv:/i, '')
      conn = Faraday.new(:url => 'http://export.arxiv.org/api/query?id_list=' + arxiv_id).get
      bibs = conn.body.make_bib_arxiv(arxiv_id)
    else
      bibs = Extcite.cont_neg(ids: ids)
    end

    # if an error or not found, skip
    bibstest = bibs.is_a?(Array) ? bibs[0] : bibs
    next if bibstest.nil?

    if bibstest.match(/error|not found/i) || bibstest.match(/<\/html>/i)
      puts "DOI found: " + ids + " ; but citation not found via content negotation - passing"
    elsif file.nil?
      return bibstest
    else
      puts "writing " + ids + " to " + file
      bibs.write_bib(file)
    end
  end
end

.extract_dois(path:) ⇒ `Object`

Extract DOIs from one or more PDFs after extracting text

Examples:

require 'extcite'
require 'faraday'
# get a paper in pdf format

path = '2068.pdf'
res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
f = File.new(path, "wb")
f.write(res.body)
f.close()
# extract doi from the pdf

Extcite.extract_dois(path: path)

Parameters:

path (String) —

Path to a pdf file, or a folder of PDF files

# File 'lib/extcite.rb', line 219

# Pull the text out of the PDF(s) at +path+ with Extcite.extract_text and
# hand that text to Extcite.get_ids.
#
# @param path [String] forwarded unchanged to Extcite.extract_text
# @return [Object] whatever Extcite.get_ids returns for the extracted text
def self.extract_dois(path:)
  Extcite.get_ids(txt: Extcite.extract_text(path: path))
end

.extract_from_metadata(path:, try_regex: true) ⇒ `Object`

Try to extract DOIs from one or more PDF metadata sections

Return: DOI string

Examples:

require 'extcite'
require 'faraday'
# get a paper in pdf format

path = '2068.pdf'
res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get;
f = File.new(path, "wb");
f.write(res.body)
f.close()
# extract doi from the pdf

Extcite.extract_from_metadata(path: path)

Parameters:

path (String) —

Path to a pdf file, or a folder of PDF files. required
try_regex (true, false) (defaults to: true) —

If no PDF metadata, try extracting using regex on the text. Uses `Extcite.extract_text_one` to extract text, then `Extcite.get_ids` to extract a DOI if any found

# File 'lib/extcite.rb', line 140

# Look for a DOI in a PDF's XMP metadata, optionally falling back to a
# regex search over the PDF text.
#
# Metadata is probed in this order: the dc:identifier attribute of
# rdf:Description, then the prism:doi, pdf:WPS-ARTICLEDOI and
# pdfx:WPS-ARTICLEDOI nodes, then dc:source. Node queries only count when
# they match exactly one node.
#
# Note that the method returns from inside the loop, so only the first path
# produced by make_paths is examined.
#
# @param path [String] passed through make_paths to obtain the list of PDFs
# @param try_regex [Boolean] when true and the metadata yields nothing,
#   search the text via Extcite.extract_text_one and Extcite.get_ids
# @return [String, nil] the identifier found for the first path, or nil;
#   the (empty) path list itself when there is nothing to iterate
def self.extract_from_metadata(path:, try_regex: true)
  make_paths(path).each do |x|
    ids = nil
    pdfmeta = PDF::Reader.new(x).metadata

    xml = nil
    unless pdfmeta.nil?
      begin
        xml = Oga.parse_xml(pdfmeta)
      rescue StandardError
        # Unparseable metadata is treated as absent. Only StandardError is
        # caught so interrupts and exit requests are not swallowed.
        xml = nil
      end
    end

    unless xml.nil?
      begin
        # try dc:identifier attribute
        identifier = xml.xpath('//rdf:Description').attr('dc:identifier')[0]
        if identifier
          ids = identifier.text.sub(/doi:/, '')
        else
          # then each DOI-bearing node in turn
          single = nil
          [
            '//rdf:Description//prism:doi',
            '//rdf:Description//pdf:WPS-ARTICLEDOI',
            '//rdf:Description//pdfx:WPS-ARTICLEDOI'
          ].each do |query|
            nodes = xml.xpath(query)
            next unless nodes.length == 1

            single = nodes
            break
          end

          if single
            ids = single.text
          else
            # last resort: dc:source, keeping only the part from "10." on
            dc_source = xml.xpath('//dc:source')
            ids = dc_source.text.match(/10\..+/).to_s if dc_source.length == 1
          end
        end
      rescue StandardError
        ids = nil
      end
    end

    # if not found, try regexing for DOI
    ids = Extcite.get_ids(txt: Extcite.extract_text_one(x)) if ids.nil? && try_regex

    return ids
  end
end

.extract_text(path:) ⇒ `Object`

Extract text from a pdf, or many pdfs

This method is used internally within fetch to parse PDFs.

Examples:

require 'extcite'
require 'faraday'
# get a paper in pdf format

path = '2068.pdf'
res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
f = File.new(path, "wb")
f.write(res.body)
f.close()
# extract doi from the pdf

Extcite.extract_text(path: path)

Parameters:

path (String) —

Path to a pdf file, or a folder of PDF files

# File 'lib/extcite.rb', line 272

# Extract the text of one or more PDFs.
#
# When +path+ is a single directory, the files listed by dir_files are used,
# restricted to those whose extension is ".pdf" (compared case-insensitively).
# Each PDF's pages are joined with newlines. A PDF that cannot be read
# produces a warning and an empty string, so the result always has one entry
# per input file.
#
# @param path [String, Array<String>] a PDF file, a list of PDF files, or a
#   single directory
# @return [Array<String>] the extracted text, one element per PDF
def self.extract_text(path:)
  paths = Array(path)
  if paths.length == 1 && File.directory?(paths[0])
    # Compare the real extension: a bare /.pdf/ also matches names such as
    # "mypdfnotes.txt" and misses ".PDF".
    paths = dir_files(paths[0]).select { |f| File.extname(f).downcase == '.pdf' }
  end

  paths.map do |pdf|
    begin
      PDF::Reader.new(pdf).pages.map { |page| page.text }.join("\n")
    rescue StandardError => e
      # best effort: report the failure and keep going with the other files
      warn e
      ""
    end
  end
end

.get_ids(txt:) ⇒ `Object`

Get DOIs from a String or Array of Strings

Return: a String holding the DOI (or arXiv id) found in the first element of txt; nil if txt is empty

Examples:

require 'extcite'
Extcite.get_ids(txt: '10.1016/j.dendro.2014.01.004 adfasdf asd fas df asdfsd')

Parameters:

txt (String) —

String or Array of Strings

# File 'lib/extcite.rb', line 235

# Find an identifier in the first string of +txt+.
#
# An arXiv id ("arxiv:NNNN.XXXX", case-insensitive) takes precedence and is
# returned with its prefix so that callers can route it to the arXiv API.
# Otherwise the first DOI-shaped token is used. Trailing punctuation that
# commonly follows an identifier in running text is stripped.
#
# Only the first element of an Array is inspected; remaining elements are
# ignored.
#
# @param txt [String, Array<String>] text to search
# @return [String, nil] the identifier, "" when the text contains none, or
#   nil when +txt+ is empty or its first element is nil
def self.get_ids(txt:)
  text = Array(txt).first
  return nil if text.nil?

  # detect if is an arxiv paper; if so, keep the arxiv id for later
  # extraction of the citation via their API
  arxiv = text.match(/arxiv:[0-9]+\.[0-9A-Za-z]+/i)
  id = if arxiv
         arxiv.to_s
       else
         # The pattern admits no whitespace, so the match is a single token.
         text.match(%r{(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%"#? ])\S)+)}).to_s
       end

  # clean up doi: first a trailing "." (optionally followed by a closer) or
  # ",", then any closer that is still left at the end
  id.gsub(/\.$|\.;$|\.\]$|\.\}$|\.\)$|,$/, '').gsub(/;$|\]$|\}$|\)$/, '')
end

Module: Extcite

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.cont_neg(ids:) ⇒ Object

.extract(path:, file: "out.bib", output: "bib") ⇒ Object

.extract_dois(path:) ⇒ Object

.extract_from_metadata(path:, try_regex: true) ⇒ Object

.extract_text(path:) ⇒ Object

.get_ids(txt:) ⇒ Object

.cont_neg(ids:) ⇒ `Object`

.extract(path:, file: "out.bib", output: "bib") ⇒ `Object`

.extract_dois(path:) ⇒ `Object`

.extract_from_metadata(path:, try_regex: true) ⇒ `Object`

.extract_text(path:) ⇒ `Object`

.get_ids(txt:) ⇒ `Object`