Living Review

Introduction

This file is part of the repository hosted at https://gitlab.com/extending-the-earcheck/living-review, and its rendered version is hosted by GitLab Pages at https://extending-the-earcheck.gitlab.io/living-review. The Google Sheet holding the extraction script template is available from https://docs.google.com/spreadsheets/d/1duDKLMmhel_5fBPhF_H-0Dbic896-5eCef9bBCSJMco.

### Require ufs >= 0.4. Using utils::packageVersion() gives a proper
### numeric version comparison; the original string comparison
### (>= "0.4") would misclassify e.g. version "0.10" as too old, and
### indexing installed.packages() errors unhelpfully ("subscript out of
### bounds") when ufs is not installed at all.
if (!requireNamespace("ufs", quietly = TRUE) ||
      (utils::packageVersion("ufs") < "0.4")) {
  stop("You need to have at least version 0.4 of the `ufs` package installed; ",
       "install it with:\n\ninstall.packages('ufs');");
}

### Install/update the packages this review depends on. These calls hit
### the network; quiet = FALSE keeps the install output visible in the
### rendered report. The indented '##' lines are console output captured
### by knitr in the rendered version of this document.

# devtools::load_all("B:/Data/R/metabefor");
### Get dev version of metabefor
ufs::quietGitLabUpdate("r-packages/metabefor", quiet = FALSE);
## Skipping install of 'metabefor' from a gitlab remote, the SHA1 (4f414c33) has not changed since last install.
##   Use `force = TRUE` to force installation
### Get version of revtools with bugfix
# ufs::quietRemotesInstall("matherion/revtools@bibtex-importing-bugfix",
#                          func = "install_github", quiet = FALSE);
# ufs::quietRemotesInstall("matherion/synthesisr@bibtex-import-bugfix",
#                          func = "install_github", quiet = FALSE);
### Get the development version of synthesisr from GitHub
ufs::quietRemotesInstall("rmetaverse/synthesisr",
                         func = "install_github", quiet = FALSE);
## Skipping install of 'synthesisr' from a github remote, the SHA1 (c406bc9f) has not changed since last install.
##   Use `force = TRUE` to force installation
### Get additional packages
ufs::checkPkgs('here');             ### To easily access files
                                    ### using 'relative paths'
ufs::checkPkgs("googlesheets4");    ### To import data from google
                                    ### sheets in metabefor
ufs::checkPkgs("synthesisr");       ### To import RIS files
ufs::checkPkgs("stringdist");       ### To compute string distances
###-----------------------------------------------------------------------------
### Settings
###-----------------------------------------------------------------------------

### By default show R code and don't comment output
knitr::opts_chunk$set(echo = TRUE);
knitr::opts_chunk$set(comment = NA);

### Master switches for the expensive chunks; set to TRUE to rerun the
### screening / extraction work instead of using stored results.
### (Fix: the first two flags used `=` for assignment while the rest of
### the file uses `<-`; made consistent.)
runScreeningChunks <- FALSE;
runExtractionChunks <- FALSE;
importExtractedStudiesFromRxsFiles <- FALSE;
runStudyTreePreparation <- FALSE;

###-----------------------------------------------------------------------------
### Paths
###-----------------------------------------------------------------------------

### Base path of the repository, resolved by the here package
basePath <- here::here();

### Subdirectory holding the exported query hits
queryHitExportPath <- file.path(basePath, "search");

### Subdirectory holding the screening files
screeningPath <- file.path(basePath, "screening");

### Subdirectory holding the rxs (extraction script) specifications
rxsSpecificationsPath <- file.path(basePath, "rxs-specifications");

### Subdirectory where the extraction script template is written
extractionScriptTemplatePath <- file.path(basePath, "extraction");

### Subdirectory holding the completed extraction scripts
extractionScriptPath <- file.path(basePath, "extraction", "Extracted studies");

### Subdirectory for intermediate output
outputPath <- file.path(basePath, "output");

### Subdirectory for data files
dataPath <- file.path(basePath, "data");

###-----------------------------------------------------------------------------
### Extraction Script and Aggregation Tree Google sheets URLs
###-----------------------------------------------------------------------------

### Google Sheet holding the extraction script specification
sheetsURL <- paste0(
  "https://docs.google.com/spreadsheets/d/",
  "1duDKLMmhel_5fBPhF_H-0Dbic896-5eCef9bBCSJMco"
);

### Google Sheet holding the aggregation tree specification
aggregationTreeURL <- paste0(
  "https://docs.google.com/spreadsheets/d/",
  "1P5IZekDxAiW3B_TLz392wd68alv4HFF5eTnoPwpGs_Q"
);

### Identifiers of the txs Google Sheets used to supplement the study
### trees. The trailing NULL lets every real entry end in a comma and
### is silently dropped by c().
txsIDs <- c(
  "12WldMPYFkJIU2VaYTg0Ysj-kiSe32eUY25Dghcnochs",  ### theories
  "18DQ0tPqqsiSIrrXJkAZ4MNsH27i2fSX588XwdUam8uo",  ### hp-measure
  "1XOi1bgmewvVeorQcNXNZHGVwi_PEQxRO2l_6vOzJst8",  ### hp-precise
  "1bpcXEXQYJcmilX4F_ZlJ5RsO2B-Ekkz0B2mVnygUEso",  ### pld-use
  "1DQZtdLYhYNa6CISxOrWpyvkEsarHDr9LaeuAyGir4d4",  ### barrier
  "1wg4PqFnZ2-L6MLYvOPOD4pUy_JbmcA5wlIxDeHwzf-U",  ### participant-categories
  "1KysisDj5bG9j8F4uIU_7C_jIBhA3KeUvKlOCGNHVBCM",  ### grey-literature
  "1FSxP6rdAtJckLrSfIShDV0kkU99edPbQclBTTqebg2I",  ### scale ranges
  NULL
);

###-----------------------------------------------------------------------------
### Extraction administration
###-----------------------------------------------------------------------------

### Columns shared by all extraction administration spreadsheets
extraction_administration_columns_all <- c(
  "qurid", "doi", "title", "year", "author",
  "included", "fulltext", "extractor", "exclude_check",
  "exclude_final", "finished", "comments"
);

### First-batch spreadsheets add the stage-1 screening bookkeeping
extraction_administration_columns_firstBatch <- c(
  extraction_administration_columns_all,
  "screening_stage1", "included_by_B_not_by_A",
  "screenerastatus", "screenerbstatus1",
  "screeneraconfidence", "screenerbconfidence"
);

### Second-batch spreadsheets use the round-2 field name for screener A
extraction_administration_columns_secondBatch <- c(
  extraction_administration_columns_all,
  "screening_stage2", "included_by_B_not_by_A",
  "screener_a_status_r2", "screenerbstatus1",
  "screeneraconfidence", "screenerbconfidence"
);
###-----------------------------------------------------------------------------
### Import rxs specifications
###-----------------------------------------------------------------------------

### devtools::load_all("B:/Data/R/metabefor");

### Enable verbose metabefor output while building the template.
options(metabefor.debug = TRUE);

### Build the extraction script template from the specification Google
### sheet; entities and value templates are also written to local CSV
### backups so the template can be rebuilt without network access.
### NOTE(review): `:::` accesses a non-exported metabefor function;
### confirm whether rxs_fromSpecifications is exported in the installed
### version (then `::` would suffice).
fullObject <-
  metabefor:::rxs_fromSpecifications(
    gs_url = sheetsURL,
    entitiesFilename = file.path(rxsSpecificationsPath,
                                 "entities-local-copy.csv"),
    valueTemplatesFilename = file.path(rxsSpecificationsPath,
                                       "valueTemplates-local-copy.csv"),
    ### Local backups are written to the same files used as fallback input
    localBackup = list(
      entities = file.path(rxsSpecificationsPath,
                           "entities-local-copy.csv"),
      valueTemplates = file.path(rxsSpecificationsPath,
                                 "valueTemplates-local-copy.csv"),
      definitions = NULL
    ),
    ### Where the generated .rxs.Rmd extraction template is written
    outputFile = file.path(
      extractionScriptTemplatePath,
      "extractionScriptTemplate.rxs.Rmd"
    ),
    instructionHeadingLevel = 4,
    returnFullObject = TRUE
  );
v Reading from "rxs-template-EarAct".
v Range 'entities'.
v Reading from "rxs-template-EarAct".
v Range 'valueTemplates'.
v Reading from "rxs-template-EarAct".
v Range 'definitions'.
v Reading from "rxs-template-EarAct".
v Range 'instructions'.
Successfully read the extraction script specifications from Google sheets.
Stored local backup of entities to 'C:/Users/tdeze/Documents/Earcheck/living-review/rxs-specifications/entities-local-copy.csv'.
Stored local backup of value templates to 'C:/Users/tdeze/Documents/Earcheck/living-review/rxs-specifications/valueTemplates-local-copy.csv'.

rxs_parseEntities read an entity spreadsheet with the following columns: 'identifier', 'parent', 'title', 'description', 'valueTemplate', 'validValues', 'default', 'examples', 'entityRef', 'fieldRef', 'owner', 'list', 'collapsing', 'repeating', 'recurring', 'recursing' & 'identifying'.
Parsed extraction script specifications into extraction script template.
Created diagrams representing the extraction tree.
Successfully wrote extraction script template to 'C:/Users/tdeze/Documents/Earcheck/living-review/extraction/extractionScriptTemplate.rxs.Rmd'.

Method

Screening

Importing hits

Here the search results are imported, deduplicated, and prepared for the screeners.

### Column names used to record the provenance of every imported record
originFile_col      <- "origin_file";
originDatabase_col  <- "origin_database";
originInterface_col <- "origin_interface";
originDate_col      <- "origin_date";

### Search hits are placed in alphabetically ordered subdirectories of
### the query hit export path; each subdirectory is one search batch.
bibliographyHitDirs <- sort(
  list.dirs(
    queryHitExportPath,
    full.names = FALSE,
    recursive = FALSE
  )
);

### Collect the .ris files per batch directory, excluding the
### "_preprocessed" copies that earlier runs generated.
bibliographyHitFiles <- lapply(
  file.path(queryHitExportPath, bibliographyHitDirs),
  function(dirPath) {
    risFiles <- list.files(
      dirPath,
      full.names = TRUE,
      pattern = "\\.ris$",
      ignore.case = TRUE,
      recursive = TRUE
    );
    grep(
      "_preprocessed",
      risFiles,
      ignore.case = TRUE,
      invert = TRUE,
      value = TRUE
    );
  }
);
names(bibliographyHitFiles) <- bibliographyHitDirs;

### Keep only the subdirectories that contain at least one file.
### lengths() is the idiomatic (and faster) replacement for
### unlist(lapply(x, length)).
bibliographyHitFiles <-
  bibliographyHitFiles[lengths(bibliographyHitFiles) > 0];

### Process the files in these subdirectories, reading all files
### separately to keep track of every entry's origin
bibliographyHits <-
  lapply(
    names(bibliographyHitFiles),
    function(currentFileList) {
      cat("\nStarting to read directory '", currentFileList, "':\n", sep="");
      return(
        lapply(
          bibliographyHitFiles[[currentFileList]],
          function(currentFile) {
            cat("\nStarting to read file '", currentFile, "':\n", sep="");
            
            cat("\nPreprocessing file.\n");
            
            ### The cleaned copy is written next to the original; the
            ### "_preprocessed" infix is what the grep() above uses to
            ### exclude these copies from later imports.
            newFileName <- paste0(
              tools::file_path_sans_ext(currentFile),
              "_preprocessed_for_synthesisr.",
              tools::file_ext(currentFile)
            );
            
            fileAsText <-
              readLines(currentFile);
            
            # fileAsText <-
            #   gsub("^T1", "TI", fileAsText);
            
            # ### Remove all lines that don't start with a RIS tag
            # fileAsText <-
            #   fileAsText[grepl("^\\s*$|^[A-Z][A-Z0-9]\\s{2}-\\s", fileAsText)];
            # ### Remove C1 lines (weird stuff Ebsco Embase exports added in)
            # fileAsText <-
            #   fileAsText[!grepl("^C1\\s{2}-\\s", fileAsText)];
            
            ### Get all lines that start with TY and ER; depends on whether
            ### the file is a pubmed export
            if (any(grepl("PMID-", fileAsText))) {
              TY_regex <- "PMID-";
              ER_regex <- "SO  -"
              ### Actually, no further processing required; pubmed isn't
              ### where the problem is
            } else {
              TY_regex <- "TY  -";
              ER_regex <- "ER  -"
              TY_lines <- grep(TY_regex, fileAsText);
              ER_lines <- grep(ER_regex, fileAsText);
              
              ### Every RIS record must open with a TY line and close
              ### with an ER line; a count mismatch means the export is
              ### corrupt and the seq() pairing below would be invalid.
              if (length(TY_lines) != length(ER_lines)) {
                stop("Something seems to have gone wrong; I found ",
                     length(TY_lines), " lines that start a RIS record ",
                     "but ", length(ER_lines), " lines that end one.");
              }
              
              ### Get all indices in between, i.e. keep only lines that
              ### fall inside a TY ... ER record
              recordlines <-
                do.call(
                  c,
                  mapply(
                    seq,
                    TY_lines,
                    ER_lines,
                    SIMPLIFY = FALSE
                  )
                );
              
              fileAsText <- fileAsText[recordlines];
              
            }
            
            writeLines(
              fileAsText,
              newFileName
            );
            
            cat("Importing file.\n");
            
            ### NOTE(review): read_refs is assumed to return a data
            ### frame of references — confirm against the synthesisr
            ### documentation.
            res <-
              #revtools::read_bibliography(
              synthesisr::read_refs(
                newFileName
              );

            ### Tag every record with the file it came from
            res[, originFile_col] <-
              basename(currentFile);
            return(res);
          }
        )
      )
    }
  );
names(bibliographyHits) <- names(bibliographyHitFiles);

### Bind each directory's list of per-file data frames into a single
### data frame per search batch.
### NOTE(review): rbind_df_list presumably aligns differing column
### sets across files — confirm against the metabefor documentation.
bibHitDfs <- stats::setNames(
  lapply(bibliographyHits, metabefor::rbind_df_list),
  names(bibliographyHits)
);

###-----------------------------------------------------------------------------
### Set the DOIs
###-----------------------------------------------------------------------------

### Consolidate DOIs for every search batch: ensure a 'doi' column
### exists, then fill missing values first from the 'article_id'
### column (via metabefor::extract_doi) and then from a 'DO' column,
### when those columns are present.
for (batchName in names(bibHitDfs)) {
  currentDf <- bibHitDfs[[batchName]];
  if (!('doi' %in% names(currentDf))) {
    currentDf$doi <- NA;
  }
  ### Fill missing DOIs extracted from the 'article_id' column
  if ('article_id' %in% names(currentDf)) {
    missingDoi <- is.na(currentDf$doi);
    extractedDois <- metabefor::extract_doi(currentDf$article_id);
    currentDf$doi[missingDoi] <- extractedDois[missingDoi];
  }
  ### Fill remaining missing DOIs from the 'DO' column
  if ('DO' %in% names(currentDf)) {
    missingDoi <- is.na(currentDf$doi);
    currentDf$doi[missingDoi] <- currentDf$DO[missingDoi];
  }
  bibHitDfs[[batchName]] <- currentDf;
}

###-----------------------------------------------------------------------------
### Internal deduplication
###-----------------------------------------------------------------------------

### Detect duplicate records within each search batch. The per-record
### result is stored in an 'internalDuplicates' column; the detailed
### (string-distance) information is kept for later inspection.
detailedDuplicateInfo <- list();
for (i in names(bibHitDfs)) {
  
  cat("\n\n**Processing ", i, "**\n\n", sep="");
  
  detailedDuplicateInfo[[i]] <-
    list(
      internal_res = metabefor::duplicate_sources(
        bibHitDfs[[i]],
        silent = FALSE,
        returnRawStringDistances = TRUE
      )
    );
      
  ### duplicate_sources() attaches its detailed results as an attribute
  detailedDuplicateInfo[[i]]$details_internal <-
    attr(detailedDuplicateInfo[[i]]$internal_res, "duplicateInfo");
  
  ### Store the per-record duplicate status with the batch itself
  bibHitDfs[[i]]$internalDuplicates <-
    detailedDuplicateInfo[[i]]$internal_res;

}

###-----------------------------------------------------------------------------
### Add unique identifiers to every record for future reference
###-----------------------------------------------------------------------------

### Quasi-unique record identifiers (qurids) are drawn from a single
### consecutive sequence spanning all batches; first work out which
### slice of that sequence belongs to each batch.
recordsPerSearchBatch <- vapply(bibHitDfs, nrow, integer(1));

totalRecords <- sum(recordsPerSearchBatch);

### Last and first index of each batch within the full sequence
recordsPerSearchBatch_lastIndices <- cumsum(recordsPerSearchBatch);
recordsPerSearchBatch_firstIndices <-
  c(1, head(recordsPerSearchBatch_lastIndices, -1) + 1);

### Pair each batch's (first, last) index
quridIndices <- Map(
  c,
  recordsPerSearchBatch_firstIndices,
  recordsPerSearchBatch_lastIndices
);
names(quridIndices) <- names(recordsPerSearchBatch);

### Generate the required number of QURIDs

### One QURID per record across all batches; the fixed origin timestamp
### makes the generated sequence reproducible across reruns.
totalQURIDs <-
  metabefor::generate_qurids(
      totalRecords,
      origin=as.POSIXct("2020-11-19 18:00:00")
    );

### Assign each batch its slice of the QURID sequence.
for (i in names(bibHitDfs)) {

  cat("\n\n**Generating QURIDS for ", i, "**\n\n", sep="");
  
  if (is.null(bibHitDfs[[i]]$qurid)) {
    bibHitDfs[[i]]$qurid <-
      totalQURIDs[quridIndices[[i]][1]:quridIndices[[i]][2]];
  } else {
    ### Only fill qurids that are missing or empty; existing ones
    ### (e.g. from an earlier run) are kept.
    bibHitDfs[[i]]$qurid <-
      ifelse(is.na(bibHitDfs[[i]]$qurid) |
               (nchar(bibHitDfs[[i]]$qurid) == 0),
             totalQURIDs[quridIndices[[i]][1]:quridIndices[[i]][2]],
             bibHitDfs[[i]]$qurid);
  }

  ### Also rename "ID" field to something else; this is a reserved
  ### field name for JabRef (at least for version 2.11, which we'll
  ### use for the screening)
  if ("ID" %in% names(bibHitDfs[[i]])) {
    names(bibHitDfs[[i]])[names(bibHitDfs[[i]]) == "ID"] <-
      "old_ID_field";
  }

}

### Report the number of records read and DOI availability for the
### first search batch.
### Fix: `ufs::cat` does not appear to be an exported ufs function;
### ufs::cat0() — the no-separator concatenating variant this file
### already uses further down — is clearly what was intended (base
### cat() would also insert spurious spaces between the arguments).
ufs::cat0(
  "A total of ",
  nrow(bibHitDfs[[1]]),
  " bibliographic entries have been read from ",
  length(unique(bibHitDfs[[1]][, originFile_col])),
  " files.",
  "\n\n",
  "Out of those ",
  nrow(bibHitDfs[[1]]),
  " entries, a DOI is available for ",
  sum(!is.na(bibHitDfs[[1]]$doi)),
  " entries (so ",
  sum(is.na(bibHitDfs[[1]]$doi)),
  " do not have a DOI)."
);

First search batch

Writing screener packages
###-----------------------------------------------------------------------------
### Process first search batch
### Note that these are sorted by batch
###-----------------------------------------------------------------------------

### Write the stage-1 screener package (bibliography plus screening
### fields) to the screening directory.
screenerPackages <-
  metabefor::write_screenerPackage(
    bibliographyDf = bibHitDfs[[1]],
    outputPath = screeningPath,
    basename = "stage1_",
    duplicateField = "duplicate"
  );

### Potentially, to screen with revtools:
# revtools::screen_titles(bibHitDf[[1]]);
Writing screener spreadsheet stage 1

This spreadsheet shows an overview of the screened articles

### Collect the screened .bib files of the first batch (one per screener)
filesToRead_firstBatch <- list.files(
  screeningPath,
  pattern = "^tiab_stage1_.*\\.bib$",
  recursive = TRUE,
  full.names = TRUE
);

### Read both screeners' files
screened_firstBatch_screenerA <- synthesisr::read_refs(filesToRead_firstBatch[1]);
screened_firstBatch_screenerB <- synthesisr::read_refs(filesToRead_firstBatch[2]);

### Merge screener B's judgement into screener A's records; drop any
### 'screenerbstatus' columns from A's file first so that B's version
### is the one that ends up in the merged data frame.
columnsToKeepFromA <-
  !grepl("screenerbstatus", names(screened_firstBatch_screenerA));
screened_firstBatch_merged <- merge(
  screened_firstBatch_screenerA[, columnsToKeepFromA],
  screened_firstBatch_screenerB[, c("qurid", "screenerbstatus1")],
  by = "qurid",
  all = TRUE
);

### A record counts as included when either screener marked it 'incl'
includedByScreenerA <-
  grepl("incl", screened_firstBatch_merged$screenerastatus, ignore.case=TRUE);
includedByScreenerB <-
  grepl("incl", screened_firstBatch_merged$screenerbstatus1, ignore.case=TRUE);
screened_firstBatch_merged$included <-
  ifelse(includedByScreenerA | includedByScreenerB, "incl", "");

### Compact audit trail per record: "<statusA>|<statusB>><final>"
screened_firstBatch_merged$screening_stage1 <- paste0(
  screened_firstBatch_merged$screenerastatus,
  "|",
  screened_firstBatch_merged$screenerbstatus1,
  ">",
  screened_firstBatch_merged$included
);

### Flag records that only screener B included (note: pattern 'inc',
### not 'incl', as in the original)
screened_firstBatch_merged$included_by_B_not_by_A <-
  !grepl("inc", screened_firstBatch_merged$screenerastatus,
         ignore.case = TRUE) &
  grepl("inc", screened_firstBatch_merged$screenerbstatus1,
        ignore.case = TRUE);

### Add any administration columns that are still missing (as empty
### strings), so that the export below can select all of them.
screened_firstBatch_merged[,
  setdiff(
    extraction_administration_columns_firstBatch,
    names(screened_firstBatch_merged)
  )] <- "";

### Export only the included records, with the administration columns,
### as the stage-1 screening overview spreadsheet.
openxlsx::write.xlsx(
  screened_firstBatch_merged[
    screened_firstBatch_merged$included == "incl",
    extraction_administration_columns_firstBatch],
  file = file.path(screeningPath,
                   "screening_overview--stage1--generated.xlsx")
);

Second search batch

Deduplication compared to first batch
###-----------------------------------------------------------------------------
### Process second search batch
###-----------------------------------------------------------------------------

### Mark records in the second batch that duplicate a record from the
### first, already-screened batch.
### NOTE(review): `screened_firstBatch` is not defined anywhere in this
### file as shown; presumably it is created in a chunk not visible here
### (e.g. when the stage-1 screening results are read back) — confirm.
secondBatchDuplicates <-
  metabefor::duplicate_sources(
    screened_firstBatch,
    bibHitDfs[[2]],
    silent = FALSE
  );

### Detailed duplicate information is attached as an attribute
secondBatchDuplicateInfo <-
  attr(secondBatchDuplicates, "duplicateInfo");

### Store the per-record duplicate status in the second batch
bibHitDfs[[2]]$duplicate <-
  secondBatchDuplicates;
Overview of duplicates
Merging 2nd and 1st search hits and write screener package
###-----------------------------------------------------------------------------
### Merge search hits and write screener package
###-----------------------------------------------------------------------------

### Stack the screened first batch on top of the second batch's hits.
### NOTE(review): `screened_firstBatch` is assumed to be defined in a
### chunk that is not visible in this file — confirm.
round2_screening <-
  metabefor::rbind_dfs(
    screened_firstBatch,
    bibHitDfs[[2]]
  );

### Returns TRUE for elements that are NA or the empty string (""),
### FALSE otherwise. Vectorized and NA-safe: never returns NA, because
### TRUE from is.na() dominates the `|`.
### Simplified from the original ifelse() construction, which is the
### ifelse-on-logicals anti-pattern; behavior is identical.
hasNoContents <- function(x) {
  return(is.na(x) | (nchar(x) == 0));
}

### Complement of hasNoContents(): TRUE for elements that are neither
### NA nor the empty string. Vectorized and NA-safe: FALSE from
### !is.na() dominates the `&`, so the result never contains NA.
### Simplified from the original ifelse() construction; behavior is
### identical.
hasContents <- function(x) {
  return(!is.na(x) & (nchar(x) > 0));
}

### A record still needs screening when it is neither flagged as a
### duplicate nor already screened in round 1.
round2_screening$to_be_screened <-
  hasNoContents(round2_screening$duplicate) &
  hasNoContents(round2_screening$screening_stage1);

### Strip duplicates and records that were already screened in round 1
### (to_be_screened is guaranteed TRUE/FALSE, never NA).
round2_screening_onlyNew <-
  subset(round2_screening, to_be_screened)

### Write the stage-2 screener packages for screeners a and b. Records
### carried over from the previous round keep their stage-1 status in
### the 'screening_stage1' field; screener judgements for this round go
### into fields named 'screener_<x>_status_r2'.
screenerPackages_round2 <-
  metabefor::write_screenerPackage(
    bibliographyDf = round2_screening_onlyNew,
    screeners = c("a", "b"),
    screenerFieldsPrefix = "screener_",
    screenerFieldsSuffix = "_status_r2",
    outputPath = screeningPath,
    prevRoundField = "screening_stage1",
    basename = "stage2_",
    duplicateField = "duplicate",
    silent=FALSE
  );

### Test
# test_screening2_pkgs <-
#   synthesisr::read_refs(
#     file.path(
#       screeningPath,
#       "stage2_a",
#       "stage2_a.bib"
#     )
#   );

### Report how many records remain to screen for the second search
### (ufs::cat0 concatenates its arguments without a separator).
ufs::cat0(
  "After the first round of screening, and after eliminating duplicate records, there are ",
  nrow(round2_screening_onlyNew),
  " records left to screen for the second search."
);
Writing screener spreadsheet stage 2

This spreadsheet shows an overview of the screened articles.

### Collect the screened .bib files of the second batch (one per screener)
filesToRead_secondBatch <- list.files(
  screeningPath,
  pattern = "^tiab_stage2_.*\\.bib$",
  recursive = TRUE,
  full.names = TRUE
);

### Read both screeners' files
screened_secondBatch_screenerA <- synthesisr::read_refs(filesToRead_secondBatch[1]);
screened_secondBatch_screenerB <- synthesisr::read_refs(filesToRead_secondBatch[2]);

### Adding screening judgement of screener B to file of first screener
screened_secondBatch_merged <- merge(
  screened_secondBatch_screenerA,
  screened_secondBatch_screenerB[, c("qurid", "screenerbstatus1")],
  by = "qurid",
  all = TRUE
);

### A record counts as included when either screener marked it 'incl'
includedByScreenerA_r2 <-
  grepl("incl", screened_secondBatch_merged$screener_a_status_r2,
        ignore.case=TRUE);
includedByScreenerB_r2 <-
  grepl("incl", screened_secondBatch_merged$screenerbstatus1,
        ignore.case=TRUE);
screened_secondBatch_merged$included <-
  ifelse(includedByScreenerA_r2 | includedByScreenerB_r2, "incl", "");

### Compact audit trail per record: "<statusA>|<statusB>><final>"
screened_secondBatch_merged$screening_stage2 <- paste0(
  screened_secondBatch_merged$screener_a_status_r2,
  "|",
  screened_secondBatch_merged$screenerbstatus1,
  ">",
  screened_secondBatch_merged$included
);

### Flag records that only screener B included (pattern 'inc', as in
### the stage-1 chunk)
screened_secondBatch_merged$included_by_B_not_by_A <-
  !grepl("inc", screened_secondBatch_merged$screener_a_status_r2,
         ignore.case = TRUE) &
  grepl("inc", screened_secondBatch_merged$screenerbstatus1,
        ignore.case = TRUE);

### Add any administration columns that are still missing (as empty
### strings), so that the export below can select all of them.
screened_secondBatch_merged[,
  setdiff(
    extraction_administration_columns_secondBatch,
    names(screened_secondBatch_merged)
  )] <- "";

### Export only the included records, with the administration columns,
### as the stage-2 screening overview spreadsheet.
openxlsx::write.xlsx(
  screened_secondBatch_merged[
    screened_secondBatch_merged$included == "incl",
    extraction_administration_columns_secondBatch
  ],
  file = file.path(screeningPath,
                   "screening_overview--stage2--generated.xlsx")
);

Extraction

Entities to extract

This is an overview of the hierarchy of entities to extract.

### Print the parsed extraction tree structure
print(fullObject$rxsStructure$parsedEntities$extractionScriptTree);

### Render the simplified extraction tree diagram in the report
DiagrammeR::render_graph(fullObject$rxsTreeDiagram_simple)

### Also export the same diagram as a PDF in the output directory
DiagrammeR::export_graph(
  fullObject$rxsTreeDiagram_simple,
  file.path(
    outputPath,
    "extraction-tree--simple.pdf"
  )
);
### Show the extraction instructions and the entity overview
cat(fullObject$rxsInstructions);
cat(fullObject$entityOverview_list);

Extraction script template

This is the extraction script generated based on the extraction script specification.

### Embed the generated extraction script template in the rendered
### report inside a scrollable <textarea>. (Presumably this chunk is
### rendered with results='asis' so the HTML passes through — confirm.)
cat("\n\n<pre><textarea rows='40' cols='124' style='font-family:monospace;font-size:11px;white-space:pre;'>",
    unlist(fullObject$rxsTemplate),
    "</textarea></pre>\n\n",
    sep="\n");

Analyses

Extraction scripts

Overview

These tabs show the extracted data per study.

# devtools::load_all("B:/Data/R/metabefor");

### Read studies
### Either (re)parse all extraction scripts from disk, or load the most
### recent automatically saved .rds snapshot, depending on the
### importExtractedStudiesFromRxsFiles switch set near the top of the
### file.
if (importExtractedStudiesFromRxsFiles) {

  studies <-
    metabefor::rxs_parseExtractionScripts(
      path = extractionScriptPath
    );

  ### Store workspace; first delete most old automatically saved
  ### workspaces
  oldfiles <- list.files(extractionScriptPath,
                         pattern='^autogenerated--rxsObject-preTreePrep--.*\\.rds$',
                         full.names=TRUE);
  
  ### Delete all but most recent one (the timestamp in the file name
  ### makes alphabetical order chronological)
  if (length(oldfiles) > 1) {
    unlink(head(sort(oldfiles), -1));
  }
  
  ### Housecleaning
  rm(oldfiles);
  
  ### Store new workspace
  currentDate <- Sys.time();
  extractedStudiesFilename <-
    format(currentDate,
           "autogenerated--rxsObject-preTreePrep--%Y-%m-%d--%H-%M.rds");
  ### Saving is best-effort: a failure is reported as a warning but does
  ### not abort the render.
  tryCatch({
    saveRDS(studies,
            file=file.path(extractionScriptPath,
                           extractedStudiesFilename));
  }, error = function(e) {
    warning(paste0("Encountered error when trying to save extracted studies to ",
                   extractedStudiesFilename, "!\n"));
  });
  
} else {

  ### Load most recently saved object
  
  fileList <- list.files(extractionScriptPath,
                         pattern='^autogenerated--rxsObject-preTreePrep--.*\\.rds$',
                         full.names=TRUE);
  
  if (length(fileList) > 0) {
    extractedStudiesFilename <- tail(sort(fileList), 1);
    studies <- readRDS(extractedStudiesFilename);
    rm(extractedStudiesFilename);
  } else {
    stop("No automatically stored extracted studies objects available!");
  }
  
  rm(fileList);

}

### Make a backup of the studies object so later (destructive) tree
### operations can be compared against the raw import.
raw <- metabefor::cloneStudiesObject(studies);

### Log the captured parsing/validation output of every extraction
### script under its own heading.
### Fix: cat() has no `collapse` argument — the unmatched name fell
### into `...` and was printed literally, and the output lines ended up
### space-separated. `sep = "\n"` is the correct way to get one line
### per element.
for (studyFileName in names(studies$rxsOutput)) {
 
  ufs::heading(studyFileName, headingLevel = 3);
  
  cat(
    studies$rxsOutput[[studyFileName]],
    sep="\n"
  );
   
}

Alnuman-Ghnimat–2019.rxs.Rmd

# Validation results Validation successful!

Ameye- Eziyi-Adekunle-Obasi-Amusa–2019.rxs.Rmd

# Validation results Validation successful!

Auchter-LePrell–2014.rxs.Rmd

# Validation results Validation successful!

Balanay-Kearney–2015.rxs.Rmd

# Validation results Validation successful!

Beach-Gilliver–2019.rxs.Rmd

# Validation results Validation successful!

Beach-Nielsen-Gilliver–2016.rxs.Rmd

# Validation results Validation successful!

Beach-Williams-Gilliver–2011.rxs.Rmd

# Validation results Validation successful!

Bogosch-House-Kudla–2005.rxs.Rmd

# Validation results Validation successful!

Callahan-Lass-Foster–2011.rxs.Rmd

# Validation results Validation successful!

Callahan-Lass-Foster-Poe-Steinberg-Duffe–2012.rxs.Rmd

# Validation results Validation successful!

Carter-Black–2017.rxs.Rmd

# Validation results Validation successful!

Cheng-DesRoches-Meunier-Eavey–2005.rxs.Rmd

# Validation results Validation successful!

Chesky-Pair-Lanford-Yoshimura–2009.rxs.Rmd

# Validation results Validation successful!

Crandell-Mills-Gauthier–2004.rxs.Rmd

# Validation results Validation successful!

Dandolini-Matheucci-Teixeira-Sharlach–2018.rxs.Rmd

# Validation results Validation successful!

Danhauer-Johnson-Byrd-DeGood-Meuel-Pecile-Koch–2009.rxs.Rmd

# Validation results Validation successful!

Danhauer-Johnson-Dunne-etal–2012.rxs.Rmd

# Validation results Validation successful!

DeBruijn_Spaans_Jansen_VantRiet_2016.rxs.Rmd

# Validation results Validation successful!

Degeest-Keppler-Corthals-Clays–2017.rxs.Rmd

# Validation results Validation successful!

Degeest-Maes-Leyssens-Keppler–2018.rxs.Rmd

# Validation results Validation successful!

DelGiacco-Serpanos–2015.rxs.Rmd

# Validation results Validation successful!

deLijster-vanderPloeg–2015.rxs.Rmd

# Validation results Validation successful!

Dell-Holmes–2012.rxs.Rmd

# Validation results Validation successful!

Diviani-Zanini-Amann-Chadha-Cieza-Rubinelli–2019.rxs.Rmd

# Validation results Validation successful!

Eilering_Zweet_2018.rxs.Rmd

# Validation results Validation successful!

Gilles-DeRidder-VanHal-Wouters-Punte-VandeHeyning–2012.rxs.Rmd

# Validation results Validation successful!

Gilles-Thuy-DeRycke-VandeHeyning–2014.rxs.Rmd

# Validation results Validation successful!

Gilles-VandeHeyning–2014.rxs.Rmd

# Validation results Validation successful!

Gilles-VanHal-DeRidder-Wouters-VandeHeyning–2013.rxs.Rmd

# Validation results Validation successful!

Gilliver-Carter-Macoun-Rosen-Williams–2012.rxs.Rmd

# Validation results Validation successful!

Gilliver_Beach_Williams_2013.rxs.Rmd

# Validation results Validation successful!

Gilliver_Beach_Williams_2015.rxs.Rmd

# Validation results Validation successful!

Gorter–2012.rxs.Rmd

# Validation results Validation successful!

Griest-Folmer-Martin–2007.rxs.Rmd

# Validation results Validation successful!

Gupta-Sharma-Singh-Goyal-Sao–2014.rxs.Rmd

# Validation results Validation successful!

Herrera-Lacerda-Lurdes-Rocha-Alcaras-Ribeiro–2016.rxs.Rmd

# Validation results Validation successful!

Hickson-Garson-etal–2007.rxs.Rmd

# Validation results Validation successful!

Holmes-Widen-Carver-White–2007.rxs.Rmd

# Validation results Validation successful!

Hoover-Krishnamurti–2010.rxs.Rmd

# Validation results Validation successful!

Hoppenbrouwer-Guérin-VanDoorslaer-VanLeeuwen-Desoete-Roelants–2018.rxs.Rmd

# Validation results Validation successful!

Hunter–2017.rxs.Rmd

# Validation results Validation successful!

Hunter–2018.rxs.Rmd

# Validation results Validation successful!

Johnson-Andrew-Walker-Morgan-Aldren–2014.rxs.Rmd

# Validation results Validation successful!

Kamphuis–2018.rxs.Rmd

# Validation results Validation successful!

Kelly-Boyd-Henehan–2015.rxs.Rmd

# Validation results Validation successful!

Keppler-Dhooge-Degeest-Vinck–2015.rxs.Rmd

# Validation results Validation successful!

Keppler-Dhooge-Vinck–2015.rxs.Rmd

# Validation results Validation successful!

Khan-Evans-Bielko-Rohlman–2018.rxs.Rmd

# Validation results Validation successful!

Lacerda-Soares-Goncalves-Lopes-Testoni–2013.rxs.Rmd

# Validation results Validation successful!

Lee-Han–2019.rxs.Rmd

# Validation results Validation successful!

Lee_Jeong_2021.rxs.Rmd

# Validation results Validation successful!

LourdesQuintanillaDieck-Artunduaga-Eavey–2009.rxs.Rmd

# Validation results Validation successful!

Marron-Marchiondo-etal–2015.rxs.Rmd

# Validation results Validation successful!

Martens-Perenboom-vanderPloeg–2006.rxs.Rmd

# Validation results Validation successful!

Matei_Broad_Goldbart_Ginsborg_2018.rxs.Rmd

# Validation results Validation successful!

Nielsen-Beach-Gilliver–2014.rxs.Rmd

# Validation results Validation successful!

OlsenWiden-Erlandsson–2004.rxs.Rmd

# Validation results Validation successful!

Olson-Gooding-Shikoh-Graf_2016.rxs.Rmd

# Validation results Validation successful!

Peters-Noijen–2019.rxs.Rmd

# Validation results Validation successful!

Portnuff–2011.rxs.Rmd

# Validation results Validation successful!

Portnuff_Fligor_Arehart_2011.rxs.Rmd

# Validation results Validation successful!

Pouwels–2017.rxs.Rmd

# Validation results Validation successful!

Rawool-CollignonWayne–2008.rxs.Rmd

# Validation results Validation successful!

Reddy-Nosa-Mafi-Welch–2021.rxs.Rmd

# Validation results Validation successful!

Reiness-Daugaard-Nielsen–2013.rxs.Rmd

# Validation results Validation successful!

Rijs-Meeuwse-Jurg-Bouman–2007.rxs.Rmd

# Validation results Validation successful!

Rosemberg-McCullagh-Nordstrom–2015.rxs.Rmd

# Validation results Validation successful!

Saunders-Dann-Griest-Frederick–2013.rxs.Rmd

# Validation results Validation successful!

Serpanos_Berg_Renne_2016.rxs.Rmd

# Validation results Validation successful!

Steen–2008.rxs.Rmd

# Validation results Validation successful!

Udoh-Adeyemo–2019.rxs.Rmd

# Validation results Validation successful!

vandenBosch-Degens–2017.rxs.Rmd

# Validation results Validation successful!

Vogel-Brug-Hosli-vdPloeg-Raat–2008.rxs.Rmd

# Validation results Validation successful!

Vogel-Brug-VanderPloeg-Hein Raat–2011.rxs.Rmd

# Validation results Validation successful!

Vogel-Brug-VanderPloeg-Raat–2010.rxs.Rmd

# Validation results Validation successful!

Vogel-Brug-VanderPloeg-Raat–2010b.rxs.Rmd

# Validation results Validation successful!

Vogel-Verschuure-VanderPloeg-Brug-Raat–2009.rxs.Rmd

# Validation results Validation successful!

Wang-etal–2021.rxs.Rmd

# Validation results Validation successful!

WarnerCzyz-Cain–2015.rxs.Rmd

# Validation results Validation successful!

Weichbold-Zorowka–2003.rxs.Rmd

# Validation results Validation successful!

Weichbold-Zorowka–2007.rxs.Rmd

# Validation results Validation successful!

Welch-Ma-Reddy–2019.rxs.Rmd

# Validation results Validation successful!

Welch_Reddy_Hand_Devine_2016.rxs.Rmd

# Validation results Validation successful!

West–2012.rxs.Rmd

# Validation results Validation successful!

Widén–2013.rxs.Rmd

# Validation results Validation successful!

Widén-Bohlin-Johansson–2011.rxs.Rmd

# Validation results Validation successful!

Widen-Holmes-Erlandsson–2006.rxs.Rmd

# Validation results Validation successful!

Widén-Holmes-Johnson-Bohlin-Erlandsson–2009.rxs.Rmd

# Validation results Validation successful!

Widen_Erlandsson_2007.rxs.Rmd

# Validation results Validation successful!

You-Kwak-Han–2020.rxs.Rmd

# Validation results Validation successful!

Zieltjens-deLijster-vanderPloeg–2014.rxs.Rmd

# Validation results Validation successful!

Zocoli-Morata-Marques-Corteletti–2009.rxs.Rmd

# Validation results Validation successful!

Extraction integrity checks

### Check whether every study tree uses unique entity (node) names;
### duplicate names would make value lookups within a tree ambiguous.
treesWithUniqueNames <-
  unlist(lapply(studies$rxsTrees, data.tree::AreNamesUnique));

if (any(!treesWithUniqueNames)) {

  ### Bug fix: inside a braced block only the *last* expression's value
  ### is auto-printed, so the overview table must be printed explicitly.
  print(table(treesWithUniqueNames));
  
  studiesWithDuplicateNames <- 
    names(studies$rxsTrees)[!treesWithUniqueNames];
  
  ### For every offending study, list the entity names occurring
  ### more than once in its tree.
  for (i in studiesWithDuplicateNames) {
    metabefor::cat0("\n\nStudy with duplicate names: ", i, "\n\n");
    allNames <- table(studies$rxsTrees[[i]]$Get('name'));
    dupNames <- which(allNames > 1);
    print(names(allNames)[dupNames]);
  }
} else {
  metabefor::cat0("\n\nAll studies have unique entity node names.\n\n");
}

Supplement values

# devtools::load_all("B:/Data/R/metabefor");

### See the manual page for `explode_vector_to_values` for more
### detail about how vectors are exploded into separate values:
###
###   ?explode_vector_to_values

### Supplement the study trees from every txs Google Sheet whose
### identifier was collected in `txsIDs`.
for (currentTxsId in txsIDs) {
  currentTxsUrl <-
    paste0("https://docs.google.com/spreadsheets/d/", currentTxsId);
  metabefor::supplement_studyTrees_from_txs(
    studies,
    currentTxsUrl,
    explode_vector_to_values = TRUE,
    silent = FALSE
  );
}

### Check correct supplementing
# studies$rxsTrees$`Alnuman-Ghnimat--2019.rxs.Rmd`$theory$value
# studies$rxsTrees$`Beach-Nielsen-Gilliver--2016.rxs.Rmd`$theory$value
# studies$rxsTrees$`Beach-Nielsen-Gilliver--2016.rxs.Rmd`$theory_HBM$value
# studies$rxsTrees$`Beach-Williams-Gilliver--2011.rxs.Rmd`$theory$value
### Add the information about each variable that was stored in the
### clustering entity holding the measurement information to the
### clustering entities with the univariate and association results.
###
### The first argument is our studies object (which we don't store
### although the function returns it, because the add the information
### to the data.tree object, which uses reference semantics (i.e. through
### the R6 object oriented programming package).
###
### The second argument is (inside the target entity node), the name
### holding the identifier of the source entity node (the node
### supplying the data). Note that that field is itself an entity
### as specified in the entity specification spreadsheet.
###
### The third argument is the prefix prepended to the newly stored
### fields in the target node.
###
### The fourth argument forces overwriting of values should they
### already exist in the target node.
###
### The fifth argument can be switched to FALSE to get detailed info.
###
### For more info:
### ?metabefor::supplement_data_from_list_inStudyTrees

# devtools::load_all("B:/Data/R/metabefor");

# studies <- metabefor::cloneStudiesObject(raw);


###-----------------------------------------------------------------------------
### univariate results
###-----------------------------------------------------------------------------

### Supplement univariate results with operationalization details:
### copy the fields of the entity referenced by `uni.variable` (found
### under 'methods') into the result entities (under 'results'),
### prefixed with "varSpec_". Existing values are left untouched.
metabefor::supplement_data_from_list_inStudyTrees(
  studies,
  sourceEntityNodeIdField_in_targetEntity = "uni.variable",
  prefix = "varSpec_",
  sourcePathString_regex = "methods",
  targetPathString_regex = "results",
  forceCopyingOfExistingValues = FALSE,
  silent = TRUE
);

### Same mechanism for the subsample details: the source entity is
### referenced by `uni.subsample`, and fields get prefix "subsample_".
metabefor::supplement_data_from_list_inStudyTrees(
  studies,
  sourceEntityNodeIdField_in_targetEntity = "uni.subsample",
  prefix = "subsample_",
  sourcePathString_regex = "methods",
  targetPathString_regex = "results",
  forceCopyingOfExistingValues = FALSE,
  silent = TRUE
);

### Then add entities from the study itself

### Collect the names of all entities that occur directly at the
### 'study' level in any study tree. Note that this list also
### contains container entities!
allEntitiesInStudy <-
  unique(
    unlist(
      lapply(
        studies$rxsTrees,
        function(studyTree) names(studyTree$children)
      )
    )
  );

### Compile the list of study-level entities to copy into the
### univariate results; the grep() calls pick up all single values
### that were exploded from vectors (hence the trailing underscore
### in the patterns).
entities_fromStudy_toSupplement <- c(
  "year.collected",
  "year.published",
  "name.author",
  "country",
  "theory_use",
  "pld_use",
  "grey_literature",
  grep("theory_", allEntitiesInStudy, value = TRUE),
  grep("sample_categories_", allEntitiesInStudy, value = TRUE)
);

### Copy every selected study-level entity into the univariate result
### entities (those having a `uni.variable` field), prefixed with
### "studyInfo_".
for (entityId in entities_fromStudy_toSupplement) {
  metabefor::cat0("\nStarting to supplement `", entityId, "`... ");
  metabefor::supplement_data_with_value_inStudyTrees(
    studies,
    sourceEntityNodeId = entityId,
    targetEntityNode_requiredField = "uni.variable",
    prefix = "studyInfo_",
    sourcePathString_regex = NULL,
    targetPathString_regex = "results/",
    forceCopyingOfExistingValues = FALSE,
    silent = TRUE
  );
  metabefor::cat0("Done with `", entityId, "`.\n");
}

### Likewise for a fixed set of entities from the methods section,
### prefixed with "studyMethods_".
methodsEntitiesToSupplement <-
  c("N", "samplingStrategy", "setting", "sample", "methodType");
for (entityId in methodsEntitiesToSupplement) {
  metabefor::cat0("\nStarting to supplement `", entityId, "`... ");
  metabefor::supplement_data_with_value_inStudyTrees(
    studies,
    sourceEntityNodeId = entityId,
    targetEntityNode_requiredField = "uni.variable",
    prefix = "studyMethods_",
    sourcePathString_regex = "methods/",
    targetPathString_regex = "results/",
    forceCopyingOfExistingValues = FALSE,
    silent = TRUE
  );
  metabefor::cat0("Done with `", entityId, "`.\n");
}

###-----------------------------------------------------------------------------
### Associations
###-----------------------------------------------------------------------------

### First details about the measurement

### Supplement association results with the operationalization
### details of the first and second variable of each association
### (prefixes "var1spec_" / "var2spec_").
### NOTE(review): forceCopyingOfExistingValues is TRUE here, unlike
### for the univariate results above (FALSE) -- presumably deliberate
### (e.g. to refresh earlier supplements); confirm this asymmetry.
metabefor::supplement_data_from_list_inStudyTrees(
  studies,
  sourceEntityNodeIdField_in_targetEntity="assoc.var1name",
  prefix="var1spec_",
  sourcePathString_regex = "methods",
  targetPathString_regex = "results",
  forceCopyingOfExistingValues = TRUE,
  silent=TRUE
);

metabefor::supplement_data_from_list_inStudyTrees(
  studies,
  sourceEntityNodeIdField_in_targetEntity="assoc.var2name",
  prefix="var2spec_",
  sourcePathString_regex = "methods",
  targetPathString_regex = "results",
  forceCopyingOfExistingValues = TRUE,
  silent=TRUE
);

### Then supplement association results with subsample details
### (here existing values are *not* overwritten, matching the
### univariate treatment).
metabefor::supplement_data_from_list_inStudyTrees(
  studies,
  sourceEntityNodeIdField_in_targetEntity="assoc.subsample",
  prefix="subsample_",
  sourcePathString_regex = "methods",
  targetPathString_regex = "results",
  forceCopyingOfExistingValues = FALSE,
  silent=TRUE
);

### Copy the selected study-level entities into the association
### result entities (those having an `assoc.var1name` field),
### prefixed with "studyInfo_".
for (entityId in entities_fromStudy_toSupplement) {
  metabefor::cat0("\nStarting to supplement `", entityId, "`... ");
  metabefor::supplement_data_with_value_inStudyTrees(
    studies,
    sourceEntityNodeId = entityId,
    targetEntityNode_requiredField = "assoc.var1name",
    prefix = "studyInfo_",
    sourcePathString_regex = NULL,
    targetPathString_regex = "results/",
    forceCopyingOfExistingValues = FALSE,
    silent = TRUE
  );
  metabefor::cat0("Done with `", entityId, "`.\n");
}

### And the same for the methods-section entities, prefixed with
### "studyMethods_".
assocMethodsEntities <-
  c("N", "samplingStrategy", "setting", "sample", "methodType");
for (entityId in assocMethodsEntities) {
  metabefor::cat0("\nStarting to supplement `", entityId, "`... ");
  metabefor::supplement_data_with_value_inStudyTrees(
    studies,
    sourceEntityNodeId = entityId,
    targetEntityNode_requiredField = "assoc.var1name",
    prefix = "studyMethods_",
    sourcePathString_regex = "methods/",
    targetPathString_regex = "results/",
    forceCopyingOfExistingValues = FALSE,
    silent = TRUE
  );
  metabefor::cat0("Done with `", entityId, "`.\n");
}

Aggregation Tree

# devtools::load_all("B:/Data/R/metabefor");

### Read the aggregation tree (the construct hierarchy used to
### aggregate operationalizations) from the Google Sheet at
### `aggregationTreeURL`.
aggregationTree <-
  metabefor::read_aggregationTree_from_gs(
    aggregationTreeURL
  );

#aggregationTree;
### Render the aggregation tree as a graph in the report.
metabefor::renderTreeAsGraph(aggregationTree);

### Attach the aggregation tree information to all study trees,
### matching on the construct identifier that was supplemented
### earlier into the field `varSpec_oper.constructIdentifier`.
metabefor::add_aggregationTree_information(
  studies = studies,
  aggregationTree = aggregationTree,
  fieldName = "varSpec_oper.constructIdentifier"
);
### Process all study trees
### Rescale a mean onto a 0-100 scale, given the minimum and maximum
### of the original response scale.
###
### Arguments:
###   mean, min, max: single values (possibly character, as extracted);
###                   coerced to numeric. NULL or non-coercible values
###                   yield NA.
### Returns: the rescaled mean (0-100), or NA when any input is
###   missing/non-numeric or the scale is degenerate.
standardizeMean <- function(mean, min, max) {

  ### Entities that were not extracted come through as NULL.
  if (is.null(mean) || is.null(min) || is.null(max)) {
    return(NA);
  }

  ### Coercion of non-numeric extractions yields NA (with a warning,
  ### which we deliberately let propagate so problems surface).
  mean <- as.numeric(mean);
  min <- as.numeric(min);
  max <- as.numeric(max);

  if (is.na(mean) || is.na(min) || is.na(max)) {
    return(NA);
  }

  ### Guard against a degenerate scale: max == min would divide by
  ### zero and produce Inf/NaN instead of a missing value.
  if (max == min) {
    return(NA);
  }

  return(
    100 * (mean - min) / (max - min)
  );
}

### Add a rescaled (0-100) mean, `uni.mean_rescaled`, to every
### univariate result entity (identified by having a `uni.name`
### field), computed from the extracted mean and the scale range.
metabefor::transform_in_every_clusteringEntity(
  studies,
  fun = standardizeMean,
  funArgs = c(
    mean = "uni.mean",
    min = "varSpec_scaleRange_min",
    max = "varSpec_scaleRange_max"
  ),
  newEntityName = "uni.mean_rescaled",
  requiredField_regex = "uni.name"
);

# studies$rxsTrees$`Balanay-Kearney--2015.rxs.Rmd`$results$YANS_1stfactor_score$value;


################################################################################
############# Compose Standardized Standard Deviations #########################

### Rescale a standard deviation proportionally to the rescaled mean:
### new_sd = (sd / (mean - min)) * new_mean.
###
### Arguments:
###   sd, mean, min: extracted values (coerced to numeric);
###   new_mean: the already rescaled mean (`uni.mean_rescaled`).
### Returns: the rescaled SD, or NA when any input is missing,
###   non-numeric, or the shifted mean is zero.
standardizeSD <- function(sd, mean, new_mean, min) {

  ### Bug fix: `min` is used below but was never checked for NULL;
  ### a NULL `min` made `mean` a zero-length numeric, and the
  ### subsequent `||` on a zero-length logical raised an error.
  if (is.null(mean) || is.null(sd) || is.null(new_mean) || is.null(min)) {
    return(NA);
  }

  ### Shift the mean to a zero-based scale before taking the ratio.
  mean <- as.numeric(mean) - as.numeric(min);
  sd <- as.numeric(sd);
  new_mean <- as.numeric(new_mean);

  if (is.na(mean) || is.na(sd) || is.na(new_mean)) {
    return(NA);
  }

  ### Guard against division by zero (mean equal to scale minimum).
  if (mean == 0) {
    return(NA);
  }

  return(
    (sd / mean) * new_mean
  );
}

### Add a rescaled standard deviation, `uni.sd_rescaled`, to every
### univariate result entity, based on the extracted SD, the mean,
### the rescaled mean, and the scale minimum.
metabefor::transform_in_every_clusteringEntity(
  studies,
  fun = standardizeSD,
  funArgs = c(
    mean = "uni.mean",
    sd = "uni.sd",
    new_mean = "uni.mean_rescaled",
    min = "varSpec_scaleRange_min"
  ),
  newEntityName = "uni.sd_rescaled",
  requiredField_regex = "uni.name"
);

Data frames

# devtools::load_all("B:/Data/R/metabefor");

# metabefor::get_singleValue_fromTree(
#   studies$rxsTrees$`Hoover-Krishnamurti--2010.rxs.Rmd`,
#   'volume_restriction_freq', returnDf = TRUE
# );
# 
# metabefor::get_singleValue_fromTree(
#   studies$rxsTrees$`Balanay-Kearney--2015.rxs.Rmd`,
#   'hpuse_concerts_asc', returnDf = TRUE
# );
# 
# univarDf <-
#   get_valueList_asDf_fromStudyTree(
#     studies$rxsTrees$`Hoover-Krishnamurti--2010.rxs.Rmd`,
#     requiredFields = "uni.variable",
#     silent=FALSE
#   );

# devtools::load_all("B:/Data/R/metabefor");

### Build the univariate results data frame from all study trees,
### capturing the verbose log so it does not clutter the document.
univarDfLog <- capture.output(
  univarDf <-
    metabefor::get_valueList_asDf(
      studies,
      requiredFields = "uni.variable",
      pathString_regex_select = "results",
      flattenVectorsInDf = TRUE,
      pathString_regex_explode = "results",
      fieldname_regex_alwaysFlatten = "varSpec_",
      silent = FALSE
    )
);

### GJ to add functionality to metabefor::get_valueList_asDf:
###   - specify columns to ignore when comparing nr of columns;
###   - store logs as attributes
###   - optionally print warnings only

# grep("WARNING", univarDfLog);
# 
# cat(univarDfLog[80:110], sep="\n")
# 
# setdiff(names(studies$rxsTrees$`Beach-Gilliver--2019.rxs.Rmd`$results$perception_sound_levels_nc$value), names(studies$rxsTrees$`Beach-Gilliver--2019.rxs.Rmd`$results$age_participants$value))

### Export the univariate data frame to Excel and to CSV
### (UTF-8, without row names).
writexl::write_xlsx(
  univarDf,
  path = file.path(dataPath, "univariate.xlsx")
);

### iconv(studies$rxsTrees$`Beach-Gilliver--2019.rxs.Rmd`$comments$value, from = "UTF-8", to = "UTF-8")

write.csv(
  univarDf,
  file.path(dataPath, "univariate.csv"),
  row.names = FALSE,
  fileEncoding = "UTF-8"
);

###-----------------------------------------------------------------------------
### Prepare TXS files
###-----------------------------------------------------------------------------

### These scripts are only necessary once, and so are placed in an external
### file commented out

# source(here::here("scripts", "prepare-txs-files.R"));
# devtools::load_all("B:/Data/R/metabefor");
### Build the associations data frame from all study trees, capturing
### the verbose log so it does not clutter the document.
assocDfLog <-
  capture.output(
    assocDf <-
      metabefor::get_valueList_asDf(
        studies,
        requiredFields = c("assoc.var1name", "assoc.var2name"),
        flattenVectorsInDf = TRUE,
        silent=FALSE
      )
  );

### Export the associations data frame to Excel.
writexl::write_xlsx(
  assocDf,
  file.path(dataPath, "associations.xlsx")
);

### iconv(studies$rxsTrees$`Beach-Gilliver--2019.rxs.Rmd`$comments$value, from = "UTF-8", to = "UTF-8")

### Consistency fix: the univariate CSV export writes without row
### names and with explicit UTF-8 encoding; do the same here so the
### two CSV files have the same shape and encoding.
write.csv(
  assocDf,
  file.path(dataPath, "associations.csv"),
  row.names = FALSE,
  fileEncoding = "UTF-8"
);
### Either save the freshly prepared studies object (plus keep the
### exported data frames written above), or load the most recently
### auto-saved object and re-read the data frames from disk.
if (runStudyTreePreparation) {
  
  ### Save to disk

  ### Store workspace; first delete most old automatically saved
  ### workspaces
  oldfiles <- list.files(
    extractionScriptPath,
    pattern='^autogenerated--rxsObject-postTreePrep--.*\\.rds$',
    full.names=TRUE
  );
  
  ### Delete all but most recent one; the filenames embed a
  ### zero-padded timestamp, so alphabetical sort order is
  ### chronological and the last element is the newest file.
  if (length(oldfiles) > 1) {
    unlink(head(sort(oldfiles), -1));
  }
  
  ### Housecleaning
  rm(oldfiles);
  
  ### Store new workspace under a timestamped filename.
  currentDate <- Sys.time();
  extractedStudiesFilename <-
    format(
      currentDate,
      "autogenerated--rxsObject-postTreePrep--%Y-%m-%d--%H-%M.rds"
    );
  ### Saving is best-effort: a failure only produces a warning so the
  ### rest of the document can still be rendered.
  tryCatch({
    saveRDS(studies,
            file=file.path(extractionScriptPath,
                           extractedStudiesFilename));
  }, error = function(e) {
    warning(
      paste0(
        "Encountered error when trying to save extracted studies to ",
        extractedStudiesFilename, "!\n"
      )
    );
  });
  
} else {

  ### Load most recently saved object
  
  fileList <- list.files(
    extractionScriptPath,
    pattern='^autogenerated--rxsObject-postTreePrep--.*\\.rds$',
    full.names=TRUE
  );
  
  ### tail(sort(...), 1) selects the newest file (timestamped names
  ### sort chronologically); error out if no auto-save exists.
  if (length(fileList) > 0) {
    extractedStudiesFilename <- tail(sort(fileList), 1);
    studies <- readRDS(extractedStudiesFilename);
    rm(extractedStudiesFilename);
  } else {
    stop("No automatically stored extracted studies objects available!");
  }
  
  rm(fileList);
  
  ### Re-read the exported data frames so `univarDf` and `assocDf`
  ### exist in this branch as well.
  univarDf <-
    as.data.frame(
      readxl::read_xlsx(
        path = file.path(dataPath, "univariate.xlsx")
      )
    );
  
  assocDf <-
    as.data.frame(
      readxl::read_xlsx(
        file.path(dataPath, "associations.xlsx")
      )
    );

}

Decisions and justifications

### Log the analysis decisions (with weighted justifications) using
### the `justifier` package, if it is installed; otherwise skip
### silently.
if ("justifier" %in% installed.packages()[, "Package"]) {
  
  ### Clean workspace so we can start storing out decisions.
  justifier::clean_workspace(force = TRUE, silent=FALSE);

  ###---------------------------------------------------------------------------
  
  ### Decision: pool multiple estimates per study/sample. Each
  ### justification carries a weight (positive = supporting,
  ### negative = opposing).
  justifier::log_decision(
    
    "When doing analyses that result in a situation where we have multiple estimates per study/sample, we decide to pool all estimates.",
    
    justification = c(
      justifier::justify(
        
        "Pooling estimates within each study means all estimates in the meta-analysis are independent.",
        
        weight = 1
        
      ),
      justifier::justify(
        
        "Pooling estimates within each study may mean that the pooled estimate is less valid than the best estimate in that study was. For example, if a study measures both Perceived Norms and Descriptive Norms, and we decide to aggregate these by pooling to run analyses on the level of Perceived Norms, the original Perceived Norms measure from that study would be more valid than the pooled estimate, which pulls the estimate towards Descriptive Norms (and so, away from Injunctive Norms).",
        
        weight = -.8
        
      )
    ),
    
    type = "data_preparation",
    
    alternatives = c(
      "Pool all estimates",
      "Selectively pool based on construct definitions",
      "Never pool, deal with dependence of estimates"
    )
    
  );

  ###---------------------------------------------------------------------------

  ### Show decisions and justifications (note: we may want to defer this to
  ### the end of the document)
  plot(justifier::workspace());
  
}
Your justifier workspace has been cleaned and is now empty.

Simple results

Studies per country

###-----------------------------------------------------------------------------
### Get a list of all countries
###-----------------------------------------------------------------------------

### NOTE: flattenVectorsInDf should be FALSE, but that throws an error;
### debug!

allCountries <-
  metabefor::get_singleValue(
    studies,
    "country",
    flattenVectorsInDf = TRUE#FALSE
  );

### Tally studies per country and turn the tally into a data frame of
### upper-cased ISO Alpha-2 codes with their frequencies.
countryTable <- table(allCountries$country);

countryDf <- data.frame(
  iso_a2 = toupper(names(countryTable)),
  freq = unname(unclass(countryTable))
);

###-----------------------------------------------------------------------------
### Add ISO Alpha-3 codes
###-----------------------------------------------------------------------------

### Get ISO Alpha-2 to ISO Alpha-3 country code translation table
### https://www.iban.com/country-codes
country_iso_codes <- utils::read.delim(
  file.path(dataPath, "country-iso-alpha2-alpha3.csv"),
  encoding = "UTF-8"
);

### Named lookup vector: names are Alpha-2 codes, values Alpha-3 codes.
iso_a3 <- stats::setNames(
  country_iso_codes$iso_a3,
  nm = country_iso_codes$iso_a2
);

### Add ISO Alpha-3 by looking up each Alpha-2 code.
countryDf$iso_a3 <- iso_a3[countryDf$iso_a2];

###-----------------------------------------------------------------------------
### Create and save plot
###-----------------------------------------------------------------------------

### Get world map (country polygons as an 'sf' data frame)
spdf_world <- rnaturalearth::ne_countries(returnclass = "sf");

### Join the per-country study frequencies onto the map; 'gu_a3' is
### the map's Alpha-3 code column. Countries without studies keep NA
### in `freq` (all.x = TRUE).
spdf_world_withUsers <-
  merge(x = spdf_world,
        y = countryDf,
        by.x = "gu_a3",
        by.y = "iso_a3",
        all.x = TRUE,
        all.y = FALSE);

### Choropleth of study counts: viridis fill on a log10 scale,
### countries without data shown in black.
worldMap <-
  ggplot2::ggplot(data = spdf_world_withUsers) +
    ggplot2::geom_sf(ggplot2::aes(fill = freq),
                     na.rm=TRUE) +
    ggplot2::scale_fill_viridis_c(
      name = "Studies",
      trans="log10",
      na.value="black"
    ) +
    ggplot2::theme_void() +
    ggplot2::theme(
      plot.margin = ggplot2::unit(c(.5,.5,.5,.5), units="cm")
    );

### Render the plot into the document and save it to `outputPath`.
ufs::knitAndSave(
  worldMap,
  path = outputPath,
  figCaption = "World map of studies",
  figWidth = 20,
  figHeight = 10);
World map of studies.

World map of studies.

Mean age

### Overview of the variables extracted for the biological-influences
### construct (this construct contains both age and gender results).
table(
  univarDf$uni.variable[
    univarDf$varSpec_oper.constructIdentifier == "inflBiological_79n2w1bj"]
  );

   age gender 
   123    214 
### Select the rows holding age results and average their means.
rows_about_age <-
  (univarDf$varSpec_oper.constructIdentifier == "inflBiological_79n2w1bj") &
      (univarDf$uni.variable == "age");

ageDf <- univarDf[rows_about_age, ];

mean(ageDf$uni.mean, na.rm=TRUE);
[1] 20.22113
### Alternatively: the same mean computed with inline subsetting
### instead of the intermediate data frame.

mean(
  univarDf[
    (univarDf$varSpec_oper.constructIdentifier == "inflBiological_79n2w1bj") &
      (univarDf$uni.variable == "age"),
    'uni.mean'
  ],
  na.rm=TRUE
);
[1] 20.22113
### Histogram of the per-result mean ages.
hist(
  univarDf[
    (univarDf$varSpec_oper.constructIdentifier == "inflBiological_79n2w1bj") &
      (univarDf$uni.variable == "age"),
    'uni.mean'
  ]
);

Sandbox

### Start with:
###   0.1 Run ufs::quietGitLabUpdate("r-packages/metabefor", quiet = FALSE);
###   0.2 Restart R

### 1) Run setup chunk to load paths
### 2) Run this to load studies:

### Parse the extraction scripts into a studies object, but only if
### it is not already present in the workspace.
if (!exists('studies')) {
  studies <- metabefor::rxs_parseExtractionScripts(
    path = extractionScriptPath
  );
}


### 3) Run the following chunks to supplement data and create data frames:
###   3.1) supplement-values-from-txs-specs
###   3.2) supplement-values-within-trees
###   3.3) produce-univar-df
###   3.4) produce-assoc-df
###
### Then you have uniVarDf and assocDf

#-----------------------------------------------------------------------------

### Change data frame to wide format (and back to long):
# ?tidyr::pivot_wider
# ?tidyr::pivot_longer


#############################################################################
# How can I find the row number of a particular value in column X ?
which(rownames(univarDf) == "1042") # "[1] 2 ".
# NOTE(review): grepl() coerces its arguments to character, so
# matching a number as a regex pattern is fragile ("." matches any
# character, and 86.80 prints as "86.8") -- verify matches manually.
which(grepl(86.80, univarDf$uni.percentage)) # "[1] 91".

##############################################################################
############### Prepare univariate data frame (general)#######################

### Combine names authors study and year of publication and add to univarDf:
univarDf$author_year <-
  paste0(univarDf$studyInfo_name.author,
         " (",
         univarDf$studyInfo_year.published,
         ")")

### 'author_year' as first column:
univarDf <- dplyr::relocate(univarDf, author_year, .before = uni.name)
# NOTE(review): options() changes a global setting and View() is
# interactive-only; both are fine in a sandbox but should not survive
# into the rendered document.
options(digits = 4)
View(univarDf)
library(tidyverse)
# Remove duplicate rows:
univarDf <- distinct(univarDf) # No fast way to do this; rows are distinct on the variable that 'splits' the row (when in metafor for certain variables a vector is used instead of a single value [e.g. for 'year of study' or 'country'], it causes the formation of a duplicate row).
# Remove empty columns:
univarDf <- Filter(function(x)!all(is.na(x)), univarDf)
# Remove study by Saunders et al. (2014) (reason: M age > 30 years):
univarDf <- univarDf[univarDf$studyId!="Saunders-Dann-Griest-Frederick--2013.rxs.Rmd", ]
# Add row numbers:
univarDf$row_number <- seq.int(nrow(univarDf))
univarDf <- relocate(univarDf, row_number, .before = author_year)
# Use the correct N for each result: prefer the result-level n, then
# the subsample N, then the study-level N (first non-missing wins).
univarDf <- univarDf %>% 
  mutate(
    N = coalesce(
      as.numeric(univarDf$uni.n),
      as.numeric(univarDf$subsample_subsample.N),
      as.numeric(univarDf$studyMethods_N)
    )
  )
# Relocate new N column
univarDf <- relocate(univarDf, N, .before = uni.variable)

View(univarDf) # df now contains 1356 rows and 79 columns.


##############################################################################
############ Prepare univariate data frame of BEHAVIOUR ######################

# Keep only rows about behaviour:
univBeh <- univarDf[univarDf$varSpec_oper.constructIdentifier=="behaviour_79n2w1bj", ]
View(univBeh) # df with 363 rows and 81 columns.
# Add row numbers:
univBeh$row_number <- seq.int(nrow(univBeh))
univBeh <- dplyr::relocate(univBeh, row_number, .before = author_year)
# Remove columns:
# NOTE(review): dropping columns by numeric position is fragile --
# any upstream change to the data frame's columns silently shifts
# which columns are removed; verify against current column order.
univBeh <- univBeh[ ,-c(8:10,17:20,22:23,27:32,35:39,41:61,63,66:69,75:76,79:81)]
# Relocate columns:
univBeh <- dplyr::relocate(univBeh, varSpec_hp_measure, .before = uni.subsample)
univBeh <- dplyr::relocate(univBeh, varSpec_hp_precise, .before = uni.subsample)
univBeh <- dplyr::relocate(univBeh, studyInfo_pld_use, .before = uni.subsample)
univBeh <- dplyr::relocate(univBeh, uni.mean_rescaled, .before = uni.percentage)

View(univBeh) # df now has 363 rows and 30 columns.


##############################################################################
############ Prepare univariate data frame HP-use ############################

# Remove studies (exclusively) about pld-use:
univHp <- univBeh[univBeh$studyInfo_pld_use!="yes, only pld use", ]
View(univHp)# df has 337 rows and 30 columns.

# Create separate data frames for the 3 types of hp-use measurement
# ("ever used", "general", "precise"/situation-specific):
univHpEver <- univHp[univHp$varSpec_hp_measure=="ever used", ]
univHpGen <- univHp[univHp$varSpec_hp_measure=="general", ]
univHpPrec <- univHp[univHp$varSpec_hp_measure=="precise", ]


# For homogeneous results, only 1 result per study. These results are AVERAGED. 

# When an average score is calculated based on homogeneous results in a study, and the N for the two (or more) results differ, then a WEIGHTED average is calculated (taking into account N, not the variance). This approach will be followed for all analyses. 

# When a weighted average was composed, it was placed in the row of its constituents, and the rows of the other(s) variable(s) was/were removed. Note that apart from the new average, the information in the other columns is not adjusted! So if e.g. the new N is needed, additional analyses are required. 

##############################################################################
###### 1)Ever HP used ########################################################

# Remove empty rows:
View(univHpEver)
univHpEver <- univHpEver[rowSums(is.na(univHpEver)) != ncol(univHpEver), ]
# Add row numbers:
univHpEver$row_number <- seq.int(nrow(univHpEver))
univHpEver <- dplyr::relocate(univHpEver, row_number, .before = author_year)
View(univHpEver) # 21 rows, 30 columns.

# Total: 7 studies, all nominal with 2 categories, except Kamphuis (2018) which has 3 categories (but only 1 is "yes, hp use"). All studies report 1 result, except Kamphuis (2018) which reports 3 results ("ever used" for 3 settings: festival, concert, dance party).

# NOTE(review): the row/column numbers below are positional and only
# valid for this exact snapshot of the data; re-verify them whenever
# the upstream data frames change.
univHpEver <- univHpEver[-c(1,4,5,7,8,10,11,13,14,17,19,20), ] # Remove the "no hp use" rows.

# Kamphuis (2018), compose average % for 3 results (2 options):

# 1. Weighted (for sample sizes) average:
univHpEver[3,13] <- with(univHpEver[c(3:5), ], sum(N * uni.percentage)/sum(N))
# 58.198% weighted average.

# 2. Average (unweighted):
# univHpEver[3,13] <- with(univHpEver[c(3:5), ], sum(uni.percentage)/3)
# 58.33% unweighted average.

univHpEver <- univHpEver[-c(4:5), ] # Remove old rows.
View(univHpEver) # df now has 7 rows and 30 columns.

mean(univHpEver$uni.percentage) # 40.93%  
# With all 7 studies equal weights, although N ranges from 43 to 1443. Note that participants differ in age and continent.
# The weighted average for Kamphuis (2018) was used.

# Basic bar plot:
plotunivHpEver <-ggplot(univHpEver, aes(x=author_year, y=uni.percentage)) + geom_bar(stat="identity")+ coord_flip()
plotunivHpEver 


###############################################################################
#### 2)General use of HP ######################################################

View(univHpGen)
# Remove empty rows
univHpGen <- univHpGen[rowSums(is.na(univHpGen)) != ncol(univHpGen), ]
# Remove empty columns
univHpGen <- Filter(function(x)!all(is.na(x)), univHpGen)
univHpGen <- univHpGen[ ,-c(12)]
# Add row numbers
univHpGen$row_number <- seq.int(nrow(univHpGen))
univHpGen <- relocate(univHpGen, row_number, .before = author_year)
# Remove duplicate rows
univHpGen <- univHpGen[-c(16,18), ]
View(univHpGen) # df now has 43 rows and 24 columns.
# Relocate operationalisation of values
univHpGen <- relocate(univHpGen, varSpec_oper.values, .before = uni.percentage)
univHpGen <- relocate(univHpGen, varSpec_oper.labels, .before = uni.percentage)
# Count number of unique studies:
univHpGenCount <- univHpGen %>% group_by(author_year) %>% summarize(count=n())
View(univHpGenCount) # 18 studies (1-5 results each), only percentages. The results of 14 studies are dichotomous, 2 studies use 2 categories, and 2 studies use 5 categories. Therefore, all results will be transformed into dichotomous results, so results can be aggregated.

# Values are not operationalized in a uniform way (e.g. 1 is not always 'yes') and some results are dichototomous, while others are categorical: 1) Recode, so 2 is always 'yes'; 2)Construct dichotomous results (if not yet present): 'sometimes' counts as 'no', 'always' and 'frequently' as 'yes'; 3) For studies with multiple categories, join all the '2s'. 

## 1) Recode, so 2 is always 'yes':
# NOTE(review): the nested ifelse() calls below pass the numeric
# value itself as the `test` argument (e.g. `ifelse(x, 1, 99)`),
# which coerces it to logical (0 -> FALSE, non-zero -> TRUE); several
# calls even omit the `no` argument, which only works because no test
# value is ever FALSE. The recodings happen to work for this data but
# should be verified against the studies' codebooks.

# Add row numbers
univHpGen$row_number <- seq.int(nrow(univHpGen))
univHpGen <- relocate(univHpGen, row_number, .before = author_year)
View(univHpGen)
# DelGiacco (2015):
univHpGen[c(9:13),9] <- ifelse(univHpGen[c(9:13),9] > 3, 2, ifelse(univHpGen[c(9:13),9], 1, 99)) 
# Gilliver (2012):
univHpGen[c(22:23),9] <- ifelse(univHpGen[c(22:23),9] > 1, 1, ifelse(univHpGen[c(22:23),9], 2)) 
# Griest (2007):
univHpGen[c(24:26),9] <- ifelse(univHpGen[c(24:26),9] < 3, 1, ifelse(univHpGen[c(24:26),9], 2)) 
# Hoppenbrouwer (2018):
univHpGen[c(31:35),9] <- ifelse(univHpGen[c(31:35),9]== 1, 2, ifelse(univHpGen[c(31:35),9], 1)) 
# Martens (2006):
univHpGen[c(38:40),9] <- ifelse(univHpGen[c(38:40),9]== 1, 2, ifelse(univHpGen[c(38:40),9], 1)) 
# Pouwels:
univHpGen[41,9] <- 2
# Zieltjes (2014):
univHpGen[c(42:43),9] <- ifelse(univHpGen[c(42:43),9]== 1, 2, ifelse(univHpGen[c(42:43),9], 1)) 

## 2)Keep only the results for 'hp use = yes' (i.e. value 2):
univHpGen1 <- univHpGen[univHpGen$uni.value==2, ]
View(univHpGen1) # df has 19 rows and 24 columns.
# Add row numbers
univHpGen1$row_number <- seq.int(nrow(univHpGen1))
univHpGen1 <- relocate(univHpGen1, row_number, .before = author_year)
## 3) Join percentage (for study with multiple 'yes' [i.e.'2s']) and remove old row:
# Del Giacco (2015):
univHpGen1[6,12] <- sum(univHpGen1[c(5:6),12])
univHpGen1 <- univHpGen1[-5, ] # df has 18 rows and 24 columns.

# Calculate mean % of hp use:
mean(univHpGen1$uni.percentage) # 12.37% is the (unweighted) mean % of "general" hp use, N of studies ranges from 41 to 9458.

# Basic bar plot:
plotHpGen1 <-ggplot(univHpGen1, aes(x=author_year, y=uni.percentage)) + geom_bar(stat="identity")+ coord_flip()
plotHpGen1 



###############################################################################
#### 3) Use of HP in specific situations ######################################

View(univHpPrec)
# Remove empty rows
univHpPrec <- univHpPrec[rowSums(is.na(univHpPrec)) != ncol(univHpPrec), ]
# The df now has 245 rows.
# Add row numbers
univHpPrec$row_number <- seq.int(nrow(univHpPrec))
univHpPrec <- relocate(univHpPrec, row_number, .before = author_year)
# Relocate columns
univHpPrec <- relocate(univHpPrec, varSpec_oper.values, .before = uni.percentage)
univHpPrec <- relocate(univHpPrec, varSpec_oper.labels, .before = uni.percentage)
univHpPrec <- relocate(univHpPrec, uni.value, .before = uni.percentage)
univHpPrec <- relocate(univHpPrec, varSpec_oper.datatype, .before = uni.value)

### Remove duplicate rows:

# Widen et al. (2006) [duplicate rows]:
univHpPrec <- univHpPrec[-c(214,216,220,222), ]
# Gilles et al. (2014)[duplicate rows]:
univHpPrec <- univHpPrec[-c(73,75,77,79,81,83,85,87,89,91), ] 
# Matei et al. (2018)[duplicate rows]:
univHpPrec <- univHpPrec[-c(124,127,130,133,136,139,142,145,148), ] 
View(univHpPrec) # The DF now has 222 rows.

## Count number of unique studies
univHpPrecCount <- univHpPrec %>% group_by(author_year) %>% summarize(count=n())
View(univHpPrecCount) # Total: 26 studies, nominal, ordinal, and numeric. Almost 2/3 of the results is dichotomous. Therefore, the remaining 1/3 of the results is converted in a dichotomous result, so the results of the studies can be aggregated. 
# Count number of results for each setting of hp use:
univHpPrecSettingC <- univHpPrec %>% group_by(varSpec_hp_precise) %>% summarize(count=n())
View(univHpPrecSettingC) # Total: 9 kind of settings (incl. 'other'), with results ranging from 3 to 48. 

### Df1:

### Recode, so 2 is always 'yes' ('sometimes'= 'no'; 'always' & 'frequently' = 'yes'):
# Per-study recode of the response codes so that code 2 means 'uses hp'.
# Column 15 is presumably `uni.value` after the relocations above — TODO confirm.
# NOTE(review): in the nested calls below, the inner ifelse uses the value
# itself as the condition (non-zero -> 1, zero -> 99, NA -> NA). If all codes
# are positive this maps every non-matching value to 1; verify that is intended.
univHpPrec1 <- univHpPrec
View(univHpPrec1)
# Auchter M.& Le Prell C.G. (2014):
univHpPrec1[c(1:5),15] <- ifelse(univHpPrec1[c(1:5),15] > 3, 2, ifelse(univHpPrec1[c(1:5),15], 1, 99)) 
# Balanay, 2015:
univHpPrec1[c(6:19),15] <- ifelse(univHpPrec1[c(6:19),15]== 1, 2, ifelse(univHpPrec1[c(6:19),15], 1, 99)) 
# Beach E., Nielsen L.B. & Gilliver M. (2016): numeric.
# Bogosch I.I., House R.A. & Kudla I. (2005):
univHpPrec1[c(24:26),15] <- ifelse(univHpPrec1[c(24:26),15]== 3, 2, ifelse(univHpPrec1[c(24:26),15], 1, 99)) 
# Callahan et al. (2014):
univHpPrec1[c(27:30),15] <- ifelse(univHpPrec1[c(27:30),15]== 1, 2, ifelse(univHpPrec1[c(27:30),15], 1, 99)) 
# Carter L. & Black D. (2017):
univHpPrec1[c(31:66),15] <- ifelse(univHpPrec1[c(31:66),15]== 3, 2, ifelse(univHpPrec1[c(31:66),15], 1, 99)) 
# de Lijster G.P.A. & van der Ploeg C.P.B. (2015):
univHpPrec1[c(67:68),15] <- ifelse(univHpPrec1[c(67:68),15]== 1, 2, ifelse(univHpPrec1[c(67:68),15], 1, 99)) 
# Eilering, M. & Zweet, D. (2018):
univHpPrec1[c(69:72),15] <- ifelse(univHpPrec1[c(69:72),15]== 1, 2, ifelse(univHpPrec1[c(69:72),15], 1, 99)) 
# Gilles et al. (2014):
univHpPrec1[c(73:82),15] <- ifelse(univHpPrec1[c(73:82),15] > 3, 2, ifelse(univHpPrec1[c(73:82),15], 1, 99)) 
# Gorter, A.F. (2012):
univHpPrec1[c(83:84),15] <- ifelse(univHpPrec1[c(83:84),15]== 1, 2, ifelse(univHpPrec1[c(83:84),15], 1, 99)) 
# Hickson et al. (2007): row 85-98, correct values (i.e. 2=yes).
# Holmes et al., (2007): row 99-118, correct values (i.e. 2=yes).
# Khan et al. (2018): row 119-121, numeric.
# Matei et al. (2018): row 122-139, correct values (i.e. 2=yes).
# Olsen et al. (2004): row 140-143, correct values (i.e. 2=yes).
# Olson et al. (2016):
univHpPrec1[c(144:147),15] <- ifelse(univHpPrec1[c(144:147),15]== 1, 2, ifelse(univHpPrec1[c(144:147),15], 1, 99)) 
# Peters et al. (2018): row 148-152, numeric.
# Rawool V.W. & Collignon-Wayne L.A. (2008):
univHpPrec1[c(154:155),15] <- ifelse(univHpPrec1[c(154:155),15]== 1, 2, ifelse(univHpPrec1[c(154:155),15], 1, 99)) 
# The response to the question used by Rawool (2008) ('I use noisy equipment (such as..) without HP') was reversed (strong disagreement now is regarded as 'hp use').
univHpPrec1[c(156:160),15] <- ifelse(univHpPrec1[c(156:160),15]==1, 2, ifelse(univHpPrec1[c(156:160),15], 1, 99)) 
# Vogel I., Brug J., Van der Ploeg C.P.B. & Raat H. (2010): correct values (i.e. 2=yes).
# Warner-Czyz A.D. & Cain S. (2015): 163-186
univHpPrec1[c(163:186),15] <- ifelse(univHpPrec1[c(163:186),15]== 3, 2, ifelse(univHpPrec1[c(163:186),15], 1, 99)) 
# Weichbold V.& Zorowka P. (2003): row 187-191, correct values (i.e. 2=yes).
# Weichbold V.& Zorowka P. (2007):      "       "   .
# Widén S, Bohlin M, Johansson I. (2011): numeric.
# Widén S.E. (2013): numeric.
# Widen S.E., Holmes A.E. & Erlandsson S.I. (2006)
univHpPrec1[c(195:202),15] <- ifelse(univHpPrec1[c(195:202),15]== 1, 2, ifelse(univHpPrec1[c(195:202),15], 1, 99)) 
# Widén S.E., Holmes A.E., Johnson T., Bohlin M., & Erlandsson (2009): row 203 - 220, correct values (i.e. 2=yes).
# Numeric studies:
# Rows with numeric results are flagged with code 2 so they survive the
# 'uni.value == 2' filter applied when building Df2 below.
univHpPrec1[c(20:23,119:121,148:153,192:194),15] <- 2
View(univHpPrec1)

### Df2:

### Keep only the results for 'hp use = yes' (i.e. value 2):
# Df2 keeps only the 'yes' rows, then merges split 'yes' categories
# (e.g. 'often' + 'always') and fills missing percentages from rescaled means.
# Column 16 is presumably `uni.percentage` and column 11 the rescaled mean
# after the earlier relocations — TODO confirm.
univHpPrec2 <- univHpPrec1[univHpPrec1$uni.value==2, ]
# Add row numbers
univHpPrec2$row_number <- seq.int(nrow(univHpPrec2))
univHpPrec2 <- relocate(univHpPrec2, row_number, .before = author_year)
# Remove empty row (that suddenly appeared...):
# NOTE(review): filtering with a logical vector containing NA yields NA rows;
# that is the likely origin of this 'empty row'. Consider which(...) instead.
univHpPrec2 <- univHpPrec2[-c(89), ]
View(univHpPrec2)# The DF now has 105 rows.

### Join percentages and rescaled mean scores:

# Auchter M.& Le Prell C.G. (2014), sum the the two '2s':
univHpPrec2[1,16] <- sum(univHpPrec2[c(1:2),16])
univHpPrec2 <- univHpPrec2[-2, ]
# Gilles et al. (2014), sum 'often' and 'always' (i.e. four '2s'):
univHpPrec2[31,16] <- sum(univHpPrec2[c(31:32),16])
univHpPrec2 <- univHpPrec2[-32, ]
univHpPrec2[32,16] <- sum(univHpPrec2[c(32:33),16])
univHpPrec2 <- univHpPrec2[-33, ] # The DF now has 102 rows.
# Add the results of the rescaled means to the column with the percentages:
univHpPrec2[c(9:12),16] <- univHpPrec2[c(9:12),11]
univHpPrec2[c(51:53),16] <- univHpPrec2[c(51:53),11]
univHpPrec2[c(67:72),16] <- univHpPrec2[c(67:72),11]
univHpPrec2[c(86:88),16] <- univHpPrec2[c(86:88),11]
View(univHpPrec2) # The column uni.percentage now has no 'NAs'.


### Df3:

# Df3 is the final per-setting data frame: homogeneous results within a study
# are averaged, subgroups are relabeled, and unusable results are dropped.
univHpPrec3 <- univHpPrec2
# Results that fall in the same category (e.g. "concert/festival") are AVERAGED, thus only 1 result per study! 
# If N for two homogeneous results differ, then for the (dichotomous) results a WEIGHTED average (taking into account N, not the variance) for that study is calculated. 

# Beach et al. (2016): calculate mean hp-use for concert & festival (n=51 for both subgroups. Therefore, no weights used.) (row 10 and 11):       
univHpPrec3[10,16] <- mean(univHpPrec3[c(10:11),16])
univHpPrec3 <- univHpPrec3[-11, ]
# Add row numbers:
univHpPrec3$row_number <- seq.int(nrow(univHpPrec3))
univHpPrec3 <- relocate(univHpPrec3, row_number, .before = author_year)
View(univHpPrec3) # The df now has 101 rows.

# Widen (2006) and Carter & Black(2017) are present respectively four and ten times, label the subgroups 'a' and 'b' to distinguish them in the plots:
# Column 2 is author_year after the relocate above.
univHpPrec3[c(15:20),2] <- "Carter L. & Black D. (2017) a"
univHpPrec3[c(21:26),2] <- "Carter L. & Black D. (2017) b"
univHpPrec3[c(88:89),2] <- "Widen S.E., Holmes A.E. & Erlandsson S.I. (2006) a"
univHpPrec3[c(90:91),2] <- "Widen S.E., Holmes A.E. & Erlandsson S.I. (2006) b"
# Widen (2006): a=USA, b=Sweden; Carter (2017): a=normal hearing, b=hearing impairment.

# Peters, G.J.Y & Noijen, J. (2019): Do not use results, as neither 'epw' nor 'epc' is similar to measures of 'hp use' in other studies.

univHpPrec3 <- univHpPrec3[-c(66:71), ]
View(univHpPrec3) # The df now has 95 rows.
# Do not use 'epc' (row 67,69,71) and use mean of behavior_concert_epw and behavior_festival_epw:    
# univHpPrec3 <- univHpPrec3[-c(67,69,71), ]
# univHpPrec3[67,11] <- mean(univHpPrec3[c(67,68),11]) 
# univHpPrec3 <- univHpPrec3[-68, ] # Concert is n=272, and festival is n=332. This difference is not taken into account.

# The df 'univHpPrec3' can now be used to calculate the average % of hp use for the specified different settings.


###############################################################################
### 3.1. Mean % of hp use at concerts/festivals ###############################

# Unweighted mean percentage of hp use in the 'concert/festival' setting.
mean(univHpPrec3[univHpPrec3$varSpec_hp_precise=="concert/festival","uni.percentage"]) # 18.2%. 

# or:
concertHp <- univHpPrec3[univHpPrec3$varSpec_hp_precise=="concert/festival", ]
mean(concertHp$uni.percentage) # 18.2% (range 1.90% - 61.20%). Unweighted average. The N of the studies ranges from 49 - 130.000!
# Add row numbers:
concertHp$row_number <- seq.int(nrow(concertHp))
concertHp <- relocate(concertHp, row_number, .before = author_year)
View(concertHp) # Df with 17 rows/results from 15 studies.

## Count number of unique studies
concertHpCount <- concertHp %>% group_by(author_year) %>% summarize(count=n())
View(concertHpCount) # Total: 15 studies. Carter (2018) and Widen (2006) both have 2 subgroups and thus each two results.

# In Carter (2018) the groups consist of people with and without a hearing impairment, in Widen (2006) two similar studies have been performed in the US and in Sweden.
# As the researchers and the methods used were the same in both cases, using the two results from each study may decrease heterogeneity of the results in the review. On the other side, the two groups in each study are very different. In the case of Widen (2006), the results even moved in opposite directions. For the latter reason, I use BOTH (!) results of the 2 studies. 

# Basic bar plot:
plotConcertHp <-ggplot(concertHp, aes(x=author_year, y=uni.percentage)) + geom_bar(stat="identity")+ coord_flip()
plotConcertHp 


###############################################################################
### 3.2. Mean % of hp use while making music ##################################

# Mean percentage of hp use while making music; studies contributing several
# results are first collapsed to one result each.
mean(univHpPrec3[univHpPrec3$varSpec_hp_precise=="making music","uni.percentage"]) # 13.71%, for 21 rows/results from 10 studies. 

# Some studies include multiple results. Use only an overall/average result from these studies:

makeMusicHp <- univHpPrec3[univHpPrec3$varSpec_hp_precise=="making music", ]
# Add row numbers:
makeMusicHp$row_number <- seq.int(nrow(makeMusicHp))
makeMusicHp <- relocate(makeMusicHp, row_number, .before = author_year)
View(makeMusicHp)
# Callahan et al.(2011): calculate mean of "Using HPD during ensemble performances" (i.e. hpuse_music_freq) and "Using HPD when practicing or rehearsing" (i.e. hpuse_performance_freq):
makeMusicHp[3,16] <- mean(makeMusicHp[c(3:4),16])
makeMusicHp <- makeMusicHp[-4, ]
# Olson et al. (2014): calculate mean of "hpuse_rehearsal_freq and "hpuse_performance_freq":
makeMusicHp[17,16] <- mean(makeMusicHp[c(17:18),16])
makeMusicHp <- makeMusicHp[-18, ]
# Matei et al. (2018): reports numerous results, use only "hpuse_all_freq" and drop the other results.
makeMusicHp <- makeMusicHp[-c(8:15), ]
View(makeMusicHp) # Df now has 11 rows from 10 studies (Carter, 2017 has results for both subgroups).

# Mean % of hp use while making music (with adjusted df):
mean(makeMusicHp$uni.percentage) # 11.80% (range 0.0% - 35.0%). The N of the studies is not taken into account (i.e. no weighted mean). N ranges from 6 - 211. 
# Basic bar plot:
plotMakeMusicHp <-ggplot(makeMusicHp, aes(x=author_year, y=uni.percentage)) + geom_bar(stat="identity")+ coord_flip()
plotMakeMusicHp 


###############################################################################
### 3.3. Mean % of hp use during club/disco visit #############################

# Mean percentage of hp use in the club/discotheque setting.
discoHp <- univHpPrec3[univHpPrec3$varSpec_hp_precise=="club/discotheque", ]
# Add row numbers:
discoHp$row_number <- seq.int(nrow(discoHp))
discoHp <- relocate(discoHp, row_number, .before = author_year)
View(discoHp) # The df has 17 rows/results from 15 studies (Carter, 2017 and Widen, 2006 each have 2 results [as they have 2 subgroups]) .

# Mean % of hp use at disco/club:
mean(discoHp$uni.percentage) # 9.55% (range 0.00% - 37.50%). The N of the studies is not taken into account (i.e. no weighted mean). The N ranges from 51 - 1757. 

# Basic bar plot:
plotDiscoHp <-ggplot(discoHp, aes(x=author_year, y=uni.percentage)) + geom_bar(stat="identity")+ coord_flip()
plotDiscoHp
# NOTE(review): dev.off() closes the current graphics device and errors when
# none is open — confirm this is intended in a non-interactive run.
dev.off()

###############################################################################
### 3.4. Mean % of hp use during use of power tool ############################

# Mean percentage of hp use while using power tools; within-study subgroup
# results are first combined into N-weighted averages.
toolsHp <- univHpPrec3[univHpPrec3$varSpec_hp_precise=="use of power tool", ]
# Add row numbers:
toolsHp$row_number <- seq.int(nrow(toolsHp))
toolsHp <- relocate(toolsHp, row_number, .before = author_year)
View(toolsHp)
# Compose weighted averages for studies with >1 (homogeneous) result:
# Pattern: write the N-weighted average sum(N * pct) / sum(N) into the first
# of the study's rows, then drop the now-redundant second row. Each drop
# renumbers later rows, so the indices below are sequential, not absolute.

# Balanay J.A.G. & Kearney G.D. (2015):
toolsHp[1,16] <- with(toolsHp[c(1:2), ], sum(N * uni.percentage)/sum(N))# 31.757%
toolsHp <- toolsHp[-2, ] # Remove old row.
# Hickson et al. (2007):
toolsHp[4,16] <- with(toolsHp[c(4:5), ], sum(N * uni.percentage)/sum(N))
# 28.92%
toolsHp <- toolsHp[-5, ]
# Holmes et al. (2007):
toolsHp[5,16] <- with(toolsHp[c(5:6), ], sum(N * uni.percentage)/sum(N))
# 13.05%
toolsHp <- toolsHp[-6, ]
# Warner-Czyz et al. (2015):
toolsHp[7,16] <- with(toolsHp[c(7:8), ], sum(N * uni.percentage)/sum(N))
# 6.76% 
toolsHp <- toolsHp[-8, ]
# Widén et al. (2009):
toolsHp[8,16] <- with(toolsHp[c(8:9), ], sum(N * uni.percentage)/sum(N))# 21.91%
toolsHp <- toolsHp[-9, ]
View(toolsHp) # Df with 8 rows from 7 studies (Carter, 2017 two results)

# Mean % of hp use while using power tools:
mean(toolsHp$uni.percentage) # 21.69% (range 6.76% - 36.13%). The N of the studies is not taken into account (i.e. no weighted mean). Before (!) scores were averaged (for studies with >1 result), N ranged from 25 - 824.

# The % of hp use while using power tools on average is higher than during lawn mowing. In studies were these 2 behaviors were aggregated, the % may therefore be lower than in studies were only use of power tools was measured. After aggregation a true comparison, thus, is no longer possible. This may also apply to other parts of the review were results that are not completely homogeneous have been aggregated.

# Basic bar plot:
plotToolsHp <-ggplot(toolsHp, aes(x=author_year, y=uni.percentage)) + geom_bar(stat="identity")+ coord_flip()
plotToolsHp
# NOTE(review): dev.off() errors when no graphics device is open — confirm.
dev.off()

###############################################################################
### 3.5. Mean % of hp use during hobby & sports ###############################

# Mean percentage of hp use during hobby/sports activities; within-study
# subgroup results are combined into N-weighted averages first.
hobbyHp <- univHpPrec3[univHpPrec3$varSpec_hp_precise=="hobby and sports", ]
# Add row numbers:
hobbyHp$row_number <- seq.int(nrow(hobbyHp))
hobbyHp <- relocate(hobbyHp, row_number, .before = author_year)
View(hobbyHp) # df with 9 rows/results from 4 studies.

# Compose weighted averages for studies with >1 (homogeneous) result:
# Indices are sequential: each removal renumbers the remaining rows.

# Holmes et al. (2007):
hobbyHp[2,16] <- with(hobbyHp[c(2:4), ], sum(N * uni.percentage)/sum(N))#14.97%
hobbyHp <- hobbyHp[-c(3:4), ]
# Warner-Czyz et al. (2015):
hobbyHp[3,16] <- with(hobbyHp[c(3:4), ], sum(N * uni.percentage)/sum(N))#2.22%
hobbyHp <- hobbyHp[-4, ]
# Widén et al. (2009):
hobbyHp[4,16] <- with(hobbyHp[c(4:6), ], sum(N * uni.percentage)/sum(N))#9.37%
hobbyHp <- hobbyHp[-c(5:6), ]
View(hobbyHp) # df with 4 rows from 4 studies.

# Mean % of hp use during hobby/sports:
mean(hobbyHp$uni.percentage) #8.14% (range 2.22% - 14.97%). The N of the studies is not taken into account (i.e. no weighted mean). Before scores were averaged (for studies with >1 result), N ranged from 23 - 1285.

# The settings in this category are quite diverse: hp-use is highest for nascar (20-30%), while it is lowest in aerobics class (about 0%).

# Basic bar plot:
plotHobbyHp <-ggplot(hobbyHp, aes(x=author_year, y=uni.percentage)) + geom_bar(stat="identity")+ coord_flip()
plotHobbyHp


###############################################################################
### 3.6. Mean % of hp use during shooting #####################################

# Mean percentage of hp use while shooting; no within-study aggregation needed.
shootingHp <- univHpPrec3[univHpPrec3$varSpec_hp_precise=="shooting", ]
# Add row numbers:
shootingHp$row_number <- seq.int(nrow(shootingHp))
shootingHp <- relocate(shootingHp, row_number, .before = author_year)
View(shootingHp) # df with 7 rows/results from 6 studies (Carter, 2017 twice).

# Mean % of hp use while using firearms:
mean(shootingHp$uni.percentage) #52.70% (range 28.00% - 79.20%), for 7 rows/results from 6 studies. The N of the studies is not taken into account (i.e. no weighted mean). The N ranges from 11 - 697.

# Basic bar plot:
plotShootingHp <-ggplot(shootingHp, aes(x=author_year, y=uni.percentage)) + geom_bar(stat="identity")+ coord_flip()
plotShootingHp


###############################################################################
### 3.7. Mean % of hp use during work #########################################

# Mean percentage of hp use in the 'at work' setting.
workHp <- univHpPrec3[univHpPrec3$varSpec_hp_precise=="at work", ]
# Add row numbers:
workHp$row_number <- seq.int(nrow(workHp))
workHp <- relocate(workHp, row_number, .before = author_year)
# For Khan et al. (2018) use only the overall result:
workHp <- workHp[-c(4:5), ]
View(workHp) # df with 5 rows/results from 5 studies.
# Mean % of hp use at work:
mean(workHp$uni.percentage) #16.30% (range 10.90% - 29.11%). The N of the studies is not taken into account (i.e. no weighted mean). The N ranges from 50 - 192.

# Basic bar plot:
plotworkHp <-ggplot(workHp, aes(x=author_year, y=uni.percentage)) + geom_bar(stat="identity")+ coord_flip()
plotworkHp
# NOTE(review): dev.off() errors when no graphics device is open — confirm.
dev.off()


###############################################################################
### 3.8. Mean % of hp use while at the pub ####################################
# Mean percentage of hp use in the 'pub' setting.
pubHp <- univHpPrec3[univHpPrec3$varSpec_hp_precise=="pub", ]
# Add row numbers:
pubHp$row_number <- seq.int(nrow(pubHp))
pubHp <- relocate(pubHp, row_number, .before = author_year)
View(pubHp)# df with 3 rows/results from 2 studies. 

# Mean % of hp use at the pub:
mean(pubHp$uni.percentage) #4.83% (range 0.00% - 12.50%). The N of the studies is not taken into account (i.e. no weighted mean). N ranges from 51 - 178 (subgroups: 115 & 63).

# Basic bar plot:
plotPubHp <-ggplot(pubHp, aes(x=author_year, y=uni.percentage)) + geom_bar(stat="identity")+ coord_flip()
plotPubHp


###############################################################################
### 3.9. Plot of 'specific', 'ever', and 'general' hp use combined ############

## Make one data frame of 'hp precise', 'hp ever' and 'hp general':
# Stack the per-setting data frames (3.1-3.8) together with the 'ever' and
# 'general' hp-use frames into one long frame for a combined box plot.

# Hp precise (data 3.1 - 3.8.):
dfP1 <- concertHp[ ,c("varSpec_hp_precise","uni.percentage")]
dfP2 <- discoHp[ ,c("varSpec_hp_precise","uni.percentage")]
dfP3 <- makeMusicHp[ ,c("varSpec_hp_precise","uni.percentage")]
dfP4 <- toolsHp[ ,c("varSpec_hp_precise","uni.percentage")]
dfP5 <- hobbyHp[ ,c("varSpec_hp_precise","uni.percentage")]
dfP6 <- shootingHp[ ,c("varSpec_hp_precise","uni.percentage")]
dfP7 <- workHp[ ,c("varSpec_hp_precise","uni.percentage")]
dfP8 <- pubHp[ ,c("varSpec_hp_precise","uni.percentage")]
# Hp ever:
View(univHpEver)
# NOTE(review): dropping column 7 by position and renaming modifies
# univHpEver in place; rerunning this section then fails or drops more data.
univHpEver <- univHpEver[ ,-7]
univHpEver <- rename(univHpEver, varSpec_hp_precise = varSpec_hp_measure)
dfEv <- univHpEver[ ,c("varSpec_hp_precise","uni.percentage")]
View(dfEv)
# Hp general:
View(univHpGen1)
univHpGen1 <- rename(univHpGen1, varSpec_hp_precise = varSpec_hp_measure)
dfGen <- univHpGen1[ ,c("varSpec_hp_precise","uni.percentage")]
View(dfGen)
# Merge the dfs:
# full_join with no `by` joins on all shared columns (here both columns),
# which effectively appends the rows of each frame.
dfPrecise <- dfP1 %>% full_join(dfP2)
dfPrecise <- dfPrecise %>% full_join(dfP3)
dfPrecise <- dfPrecise %>% full_join(dfP4)
dfPrecise <- dfPrecise %>% full_join(dfP5)
dfPrecise <- dfPrecise %>% full_join(dfP6)
dfPrecise <- dfPrecise %>% full_join(dfP7)
dfPrecise <- dfPrecise %>% full_join(dfP8)
dfPrecise <- dfPrecise %>% full_join(dfEv)
dfPrecise <- dfPrecise %>% full_join(dfGen)
View(dfPrecise)
# Rename columns:
dfPrecise <- rename(dfPrecise, setting = varSpec_hp_precise)
dfPrecise <- rename(dfPrecise, percentage = uni.percentage)

## Basic box plot with dots:
precisePlot <- ggplot(dfPrecise, aes(x=setting, y=percentage)) + 
  geom_boxplot()+ geom_dotplot(binaxis='y', stackdir='center', dotsize=0.3) + coord_flip()
precisePlot
# NOTE(review): dev.off() errors when no graphics device is open — confirm.
dev.off()

# If 'hp use ever' and 'hp use general' are shown in the same plot with the 8 different categories/settings of 'hp precise', these two boxes should be at one extreme of the plot and in a different color. 



##############################################################################
####### Prepare univariate data frame of PSYCHOLOGICAL determinants ##########

## Drop demographic variables and behavior from data frame:
# Build univDet: the univariate results restricted to psychological
# determinants (biological influences and behaviour itself are removed).
univDet <- univarDf[univarDf$varSpec_oper.constructIdentifier!="inflBiological_79n2w1bj", ]
univDet <- univDet[univDet$varSpec_oper.constructIdentifier!="behaviour_79n2w1bj", ] 
# Add the row numbers:
univDet$row_number <- seq.int(nrow(univDet))
univDet <- dplyr::relocate(univDet, row_number, .before = author_year)
View(univDet)   # df with 659 rows and 79 columns.
# Drop subsamples scores "attitude_soundculture" from Chesky,2009
# NOTE(review): the hard-coded row indices below assume the row order produced
# above; each removal renumbers subsequent rows.
univDet <- univDet[-c(80:81), ] 
# Gilliver et al. (2012) contains duplicate rows. In addition, 'volume_both_other_score' should be dropped (not usable for review):
univDet <- univDet[-c(207, 210:212), ]                                  
#Drop duplicate rows from Widen (2006):
univDet <- univDet[-c(589,591,593,595), ] # Widen (2006) now has 4 rows/results (row 589 - 592)
View(univDet) # The df now has 649 rows and 79 columns.


###############################################################################
################ Psych. det. data frame for measures of central tendency ###### 
# Only MEAN, as no median has been extracted.
# Subset of univDet with a non-missing mean (column 11 — presumably uni.mean;
# TODO confirm column position).
univDetMean <- univDet[complete.cases(univDet[ ,11]), ] # Select results for means.
View(univDetMean)
# Relocate columns:
univDetMean <- relocate(univDetMean, uni.mean_rescaled, .after = uni.mean)
univDetMean <- relocate(univDetMean, uni.sd_rescaled, .after = uni.sd)
# Add row numbers:
univDetMean$row_number <- seq.int(nrow(univDetMean))
univDetMean <- dplyr::relocate(univDetMean, row_number, .before = author_year)
View(univDetMean)
# Gilliver et al. (2012): the mean standardized score on 'risk_perception_79n2fh4t' has to be reversed:
# NOTE(review): 21.21 is a manually computed reversed value — verify against
# the source data if the extraction sheet changes.
univDetMean[56,12] <- 21.21

View(univDetMean)# df with 168 rows and 79 columns

# Create COUNT overview of psych. det. with MEAN sorted by the parent UCID:
univDetMeanId <- univDetMean %>% group_by(univDetMean$varSpec_oper.constructIdentifier_aggr2)%>% summarise(n = n())
View(univDetMeanId) # 15 DCTs (+ NAs) with range 1 - 24.




##############################################################################
################ Psych. det. data frame for frequencies ######################

# Data is nominal, ordinal, and numeric. 
# Subset of univDet with a non-missing frequency/percentage (column 12 —
# presumably uni.percentage; TODO confirm column position).
univDetFreq <- univDet[complete.cases(univDet[ ,12]), ]
# Add row numbers:
univDetFreq$row_number <- seq.int(nrow(univDetFreq))
univDetFreq <- relocate(univDetFreq, row_number, .before = author_year)
View(univDetFreq) # df with 480 rows and 79 columns.

# Relocate columns:
univDetFreq <- relocate(univDetFreq, uni.percentage, .before = uni.minimum)
univDetFreq <- relocate(univDetFreq, varSpec_oper.constructIdentifier_aggr2, .before = uni.minimum)
univDetFreq <- relocate(univDetFreq, varSpec_oper.description, .before = uni.minimum)
univDetFreq <- relocate(univDetFreq, varSpec_oper.comment, .before = uni.minimum)
univDetFreq <- relocate(univDetFreq, studyInfo_pld_use, .before = uni.minimum)
univDetFreq <- relocate(univDetFreq, varSpec_oper.datatype, .before = uni.minimum)
univDetFreq <- relocate(univDetFreq, varSpec_oper.values, .before = uni.minimum)
univDetFreq <- relocate(univDetFreq, varSpec_oper.labels, .before = uni.minimum)
View(univDetFreq)

# COUNT overview of psych. det. for frequencies sorted by the parent UCID:
univDetFreqId <- univDetFreq %>% group_by(univDetFreq$varSpec_oper.constructIdentifier_aggr2)%>% summarise(n = n())
View(univDetFreqId) # 20 DCTs with range of rows 3 - 83. However, 1 item on a 5-points Likert scale count as 5 rows etc., and duplicate rows are included.

# COUNT overview of psych. det. for frequencies sorted by the number of categories used (for ordinal/nominal variables):
univDetFreqId2 <- univDetFreq %>% group_by(univDetFreq$varSpec_oper.values)%>% summarise(n = n())
View(univDetFreqId2) # '0' & '1' & '1' & '2' = 157 rows (=78 results)/ '1', '2' & '3' = 69 rows (=35 results)/ '1', '2', '3' & '4' = 75 rows (=37 results)/ '1', '2', '3', '4' & '5' = 154 rows (=77 results)/ 25 rows have no value (i.e. likely are numeric).

# As a substantial part of the results (>50% of total) have been measured using 3 or more categories, it does not make sense to dichotomize all results. Therefore, when 3 or more categories are used, the result is converted into a (rescaled) MEAN score (with SD) and ADDED to the df for means. The remaining part (i.e. the dichotomous results), will be aggregated separately.


###############################################################################
############ Aggregation of FREQUENCIES for equal DCTs ########################

### Set up:

# First select the results that will be used in the "analysis of means". The frequencies of these studies are used to calculate a mean and SD, these calculated values are added to the respective columns, the min. and max. value are added to the txs-spreadsheet, and subsequently a rescaled M+SD are calculated and added to the df. These rescaled values are used to calculate the weighted averages (using a random effects model) for each DCT (including the new and old studies that report a M+SD).

# Transform all variables that are not used in the analysis of means into dichotomous variables (if not already dichotomous) and, subsequently, aggregate all results (for the same DCT) that express agreement (i.e. "yes", "agree and completely agree", "very likely", etc.).

# The direction of the univariate results will be such that a high score (likely) results in a high level of h.p.b., and vice versa. For some constructs this interpretation is intuitive (e.g. for 'intention'), for other constructs provision of additional info will be needed (e.g. for 'descriptive norms').

# For all the results from Reiness (2018), Griest (2007), and Kamphuis (2018) weighted AVERAGES have to be computed first, as these studies report separate results for subgroups!

# For studies with multiple results in the same category (i.e. with the same DCT) (e.g. Beach et al, 2016), weighted AVERAGES will be used, unless the items are too heterogeneous and/or subcategories are created (e.g. taking into account a specific setting or social referent).


### Checking the DCTs (going from small to large number of results per DCT), to decide if/how data can be aggregated and if results are used as frequencies or means:

# Frequencies that can be transformed, are converted into means using this template:
# Reconstruct the individual scores from published answer-category proportions
# so a mean and SD can be computed: category i is repeated round(a[i]) times.
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234) # Answer frequencies for each of the categories (here 5 cat.).
a # Gives the n for each of the 5 groups.
# Idiomatic replacement for unlist(lapply(1:5, function(i) rep(i, round(a[i])))):
# rep() with a `times` vector repeats each category code by its group size.
b <- rep(seq_along(a), times = round(a))
b # Vector of the 125 values (124 after rounding the group sizes).
mean(b)
sd(b) 
# Min.=1, max.=5. (values are needed for rescaling).

# A high score (e.g. 5 on a 5-points scale) indicates a high presence of the specific construct (often high agreement with a statement), and vice versa. If scoring is in the opposite direction, scores have to be reversed: i.e. in the vector for the transformation, fill in the frequency of the highest value (e.g. 5) first, and the frequency of the lowest value (e.g. 1) last.

## 1. cues_79n2w1bj:
# Subset of frequency results for the 'cues' construct (one study only).
cues <- univDetFreq[univDetFreq$varSpec_oper.constructIdentifier_aggr2=="cues_79n2w1bj", ] 
# Study/studies: 1 (Reiness) with 2 subgroups. 
# Type of data: nominal (2 cat).
# Nr. of results: 1.
# Steps to take: A weighted (for the N of the subgroups) average has to be calculated. 
# Decision: Only 1 result. Drop, or join with data of other construct (and relabel DCT)?
# Transfer to "Means" df: 0

## 2.injunctiveNorms_73dnt5zj:
# Subset of frequency results for the 'injunctive norms' construct.
injNorms <- univDetFreq[univDetFreq$varSpec_oper.constructIdentifier_aggr2=="injunctiveNorms_73dnt5zj", ] 
View(injNorms)
# Study/studies: 2 (Pouwels & Reiness)
# Type of data: ordinal (5 cat.) & nominal (2 cat.)
# Nr. of results: 2.
# Steps to take: join "agree" and "totally agree" for Pouwels. Use answer "likely" from Reiness. Then join the % of the results from the two studies.
# Decision: Only 2 results. Join results with parent DCT ("attitude"), or drop data? Or could result be joined with results for "means" (see previous part)?
# Transfer to "Means" df: 1, Pouwels, L. (2017). Note: no inj norms in "Means" df! 

# Transform Pouwels, L. (2017):
# hp_if_friends_positive_freq:
# Reconstruct the 675 individual 5-point scores from the category proportions
# (category i repeated round(a[i]) times), then compute mean and SD.
a <- 675*c(0.55, 0.29, 0.087, 0.062, 0.011)
b <- rep(seq_along(a), times = round(a))
mean(b) # 1.693
sd(b) # 0.9416
# Min.=1, max.=5.


#3. threat_severity_79n2fh4r:
# Subset of frequency results for the 'threat severity' construct.
severity <- univDetFreq[univDetFreq$varSpec_oper.constructIdentifier_aggr2=="threat_severity_79n2fh4r", ] 
View(severity)
# Study/studies: 2 (Griest & Hoppenbrouwer)
# Type of data: nominal (2 & 5 cat.)
# Nr. of results: 2.
# Steps to take: use results for 'disagree', as the statement is 'reversed'. Griest has 2 subgroups so results first have to be averaged!
# Decision: Only 2 results. Join or drop?
# Transfer to "Means" df: 1, Hoppenbrouwer (2018).

# Transform Hoppenbrouwer (2018):
# hearingloss_problem_freq: ("'I don't think losing part of my hearing would be a big problem"; 'agree', 'neutral', 'disagree', 'I don't know' & 'no reply'): 'agree'=1, ('neutral' + 'I don't know'=2), 'disagree'=3, 'no reply' =not used (so 5-point scale becomes 3-point scale and total is not 100%). High disagreement/high score=high severity, so direction is okay.
# Reconstruct individual 3-point scores from the category proportions.
a <- 1443*c(0.126, 0.212, 0.645)
b <- rep(seq_along(a), times = round(a))
mean(b) # 2.528
sd(b) # 0.7114  
# Min.=1, max.=3.


#4. emotion_79n2r0sy:
# Subset of frequency results for the 'emotion' construct.
emotion <- univDetFreq[univDetFreq$varSpec_oper.constructIdentifier_aggr2=="emotion_79n2r0sy", ] 
View(emotion)
# Study/studies: 1 (Callahan, 2012)
# Type of data: ordinal (5 cat.)
# Nr. of results: 2
# Steps to take: join "agree" and "totally agree" for both results.
# Decision: Only 2 results. Both items measure "worry" (for hearing loss/tinnitus). Items can, therefore, be relabeled as "risk perception".
# Transfer to "Means" df: 1, Callahan (2012). There, relabel as "risk perception".

# Transform Callahan (2012), 2 results:
# Both results had to be reversed (i.e. filled in in reversed order in vector)!
# Reconstruct the individual 5-point scores from the (reversed) category
# proportions; `a` and `b` are reused for each result in turn.
# worried_hearingloss_freq:
a <- 123*c(0.084,0.072,0.264,0.392,0.184)
b <- rep(seq_along(a), times = round(a))
mean(b) # 3.533
sd(b) # 1.13 
# Min.=1, max.=5.

# worried_tinnitus_freq:
a <- 126*c(0.167,0.103,0.365,0.294,0.071)
b <- rep(seq_along(a), times = round(a))
mean(b) # 3.0
sd(b) # 1.166
# Min.=1, max.=5.



#5. instrumentalAttitude_73dnt5zb
# Subset of frequency results for the 'instrumental attitude' construct.
instrAtt <- univDetFreq[univDetFreq$varSpec_oper.constructIdentifier_aggr2=="instrumentalAttitude_73dnt5zb", ] 
View(instrAtt)
# Study/studies: 3 
# Type of data: nominal (2,3 & 5 cat.)
# Nr. of results: 3
# Steps to take: use results for 'agree' and 'yes'. For Gorter (2012) use results for 'disagree', as statement is 'reversed'!
# Decision: Only 3 results. Join or drop?
# Transfer to "Means" df: 2, Gorter, A.F. (2012) & Hoppenbrouwer (2018).

# Transform Gorter, A.F. (2012):
# hpuse_unnecessary_ifhearinggood_freq:
# "'It is not necessary to wear hearing protection if your hearing is still good", with 1=agree, 3=disagree; so scoring does not have to be reversed (high disagreement/high score = high instr. attitude).
# Reconstruct the individual 3-point scores from the category proportions.
a <- 20000*c(0.54,0.19,0.27)
b <- rep(seq_along(a), times = round(a))
mean(b) # 1.73
sd(b) # 0.8586  
# Min.=1, max.=3.

# Transform Hoppenbrouwer (2018):
# hp_prevent_hearingloss_freq:
# "I am convinced that I can prevent hearing loss....". Scoring has to be reversed. After reversing: 'agree'=3, ('neutral' + 'I don't know'=2), 'disagree'=1, 'no reply' =not used (so 5-point scale becomes 3-point scale and total is not 100%). High agreement/high score=high instr. attitude.
a <- 1443*c(0.175,0.425,0.39)
b <- rep(seq_along(a), times = round(a))
mean(b) # 2.217
sd(b) # 0.7241 
# Min.=1, max.=3.


#6. motivationToComply_73dnt5zf
# Subset of frequency results for the 'motivation to comply' construct.
motComply <- univDetFreq[univDetFreq$varSpec_oper.constructIdentifier_aggr2=="motivationToComply_73dnt5zf", ] 
View(motComply)
# Study/studies: 4
# Type of data: dichotomous
# Nr. of results: 4
# Steps to take: use the 'yes'. All 4 studies are about m.t.c. towards doctor, so that has to be mentioned (i.e. it is not a general m.t.c.)!
# Decision: If used, it is not about the importance of a certain psych. construct, but about the importance of the 'referent' (here: a doctor), so not on the same level as other constructs.
# Transfer to "Means" df: 0.


#7. attitude_73dnt5zc
# Subset of frequency results for the 'attitude' construct.
attitude <- univDetFreq[univDetFreq$varSpec_oper.constructIdentifier_aggr2=="attitude_73dnt5zc", ] 
View(attitude)
# Study/studies: 3
# Type of data: ordinal (5 cat.)
# Nr. of results: 3
# Steps to take: use the answers that express agreement with h.p.b.
# Decision: Martens is about 'The importance of hearing well versus the importance of loud music', while the other 2 studies are about attitude towards the use of earplugs. Can answer 'Hearing more important than loud music' on this item be aggregated with positive attitudes towards earplug use (in this case 'totally agree' and 'Very important')?
# Transfer to "Means" df: 2, Pouwels, L. (2017) & Zieltjens (2014).

# Transform Pouwels, L. (2017):
# hpuse_important_freq:
# High score=Positive attitude towards wearing earplugs.
# Reconstruct the individual 5-point scores from the category proportions.
a <- 675*c(0.044,0.186,0.103,0.498,0.169)
b <- rep(seq_along(a), times = round(a))
mean(b) # 3.559
sd(b) # 1.107 
# Min.=1, max.=5.

# Transform Zieltjens (2014):
# opinion_hearing_protection_freq:
# "'What do you think about hearing protection?" 'Very important', 'A little bit important', 'Not important', 'I am not thinking about that at all'. The first 3 values are ordinal, but the 4th is not. Scores also have to be reversed.
# New order: 'Very important'=4, 'A little bit important'=3, 'I am not thinking about that at all'=2, 'Not important'=1. This is not completely correct, but it is the best possible option (now high score=pro-earplug attitude).
a <- 95*c(0.0737,0.3158,0.4421,0.1684)
b <- rep(seq_along(a), times = round(a))
mean(b) # 2.705
sd(b) # 0.8363
# Min.=1, max.=4.


#8. perceivedBehavioralControl_73dnt603
# All univariate frequency results coded as perceived behavioral control:
pbc <- univDetFreq[
  univDetFreq$varSpec_oper.constructIdentifier_aggr2 == "perceivedBehavioralControl_73dnt603",
]
View(pbc)
# Study/studies: 3
# Type of data: ordinal (4 cat.)
# Nr. of results: 3 (2 studies by same author, in all studies same instrument used)
# Steps to take: use the agreement answers.
# Decision: Only 3 results. As pbc is a 'parent' results can not be joined with other construct.
# Transfer to "Means" df: 3 studies, Danhauer (2009), Danhauer (2011), You (2020). Note: reorder categories first.

# Transform Danhauer (2009):
# hearingloss_prevented_freq:
# 'Do you think that hearing loss caused by noise can be prevented?''Yes', 'No', 'Maybe' & 'I don't know'. Join the latter two categories, and switch the order (then high score=high pbc).
# Reconstruct one score per respondent from the reported proportions:
a <- 553*c(0.034,0.153,0.823)
b <- rep(seq_along(a), round(a))
mean(b)  # 2.78
sd(b)    # 0.4899
# Min.=1, max.=3.

# Transform Danhauer (2012):
# hearingloss_prevented_freq:
# Same question & answers: # 'Do you think that hearing loss caused by noise can be prevented?''Yes', 'No', 'Maybe' & 'I don't know'. Join the latter two categories, and switch the order (then high score=high pbc).
a <- 126*c(0.024,0.087,0.889)
b <- rep(seq_along(a), round(a))
mean(b)  # 2.865
sd(b)    # 0.407
# Min.=1, max.=3.

# Transform You (2020):
# hearingloss_prevented_freq:
# Same question & answers as in the 2 studies by Danhauer; join the latter two categories and switch the order (then high score=high pbc).
a <- 1009*c(0.096,0.242,0.662)
b <- rep(seq_along(a), round(a))
mean(b)  # 2.566
sd(b)    # 0.6621
# Min.=1, max.=3.


#9. perceivedNorms_73dnt5zq
# All univariate frequency results coded as perceived norms:
norms <- univDetFreq[
  univDetFreq$varSpec_oper.constructIdentifier_aggr2 == "perceivedNorms_73dnt5zq",
]
View(norms)
# Study/studies: 2
# Type of data: dichotomous & ordinal (5 cat.)
# Nr. of results: 3
# Steps to take: The dichotomous by Chung is not a clear case of "norms": better to drop it. The 2 items by Hoppenbrouwer are injunctive and descriptive norms, and should be placed in these categories.
# Decision: Relocate data Hoppenbrouwer. Do not use Chung.
# Transfer to "Means" df: 1 (2 results), Hoppenbrouwer (2018).

# Transform Hoppenbrouwer (2018) [1]:
# hp_friendsdo_freq: 'agree'=3, ('neutral' + 'I don't know'=2), 'disagree'=1, 'no reply'=not used. So the 5-point scale becomes a 3-point scale, the total is not 100%, and scores are reversed. Then high agreement/high score = high influence of norms on hearing protection.
a <- 1443*c(0.016,0.566,0.283)
b <- rep(seq_along(a), round(a))
mean(b)  # 2.308
sd(b)    # 0.5004
# Min.=1, max.=3.

# Transform Hoppenbrouwer (2018) [2]:
# hp_friendsthink_freq: same recoding as the previous item.
a <- 1443*c(0.229,0.722,0.030)
b <- rep(seq_along(a), round(a))
mean(b)  # 1.797
sd(b)    # 0.4718
# Min.=1, max.=3.


#10. inflEnvironment_79n2r0sy
# All univariate frequency results coded as influence of the environment:
inflEnvironment <- univDetFreq[
  univDetFreq$varSpec_oper.constructIdentifier_aggr2 == "inflEnvironment_79n2r0sy",
]
View(inflEnvironment)
# Study/studies: 6
# Type of data: 5x dichotomous, 1x ordinal (3 cat.)
# Nr. of results: 6
# Steps to take: Questions are quite heterogeneous and may (also) measure another construct.
# Decision: Possibly place items in other category (i.e. give other DCT), and drop items that can not be placed elsewhere.
# Transfer to "Means" df: 1, Martens (2006). Note: relabel (i.e. DCT) for Martens?

# Transform Martens (2006):
# volumerestriction_mp3_freq:
# 'Should there be a sound limiter on your walkman?'. Reverse the scoring. Then high score = pro-sound limiter (which DCT?).
a <- 501*c(0.725,0.165,0.11)
b <- rep(seq_along(a), round(a))
mean(b)  # 1.385
sd(b)    # 0.6762
# Min.=1, max.=3.


#11. motivation_79n2fh4q
# All univariate frequency results coded as motivation:
motivation <- univDetFreq[
  univDetFreq$varSpec_oper.constructIdentifier_aggr2 == "motivation_79n2fh4q",
]
View(motivation)
# Study/studies: 4
# Type of data: nominal (2 cat.), ordinal (3 cat.) & numeric.
# Nr. of results: 6
# Steps to take: The 3 results from Hoover have to be averaged! The results from the other 2 studies are slightly different; too heterogeneous to combine in 1 DCT? The score (i.e. %) for Steen is in the opposite direction (i.e. amotivation)! This score can not simply be reversed, as it does not mean that the other participants are motivated. Drop item?
# Decision: too heterogeneous to combine in 1 DCT? Possibly relocate (some) items?
# Transfer to "Means" df: 1 study with 3 results [can be averaged], Hoover A. & Krishnamurti S. (2010).

# Transform Hoover A. & Krishnamurti S. (2010) [1]:
# For all 3 questions the original order can be maintained.
# 1. decrease_time_freq:
a <- 390*c(0.162,0.343,0.495)
b <- rep(seq_along(a), round(a))
mean(b)  # 2.333
sd(b)    # 0.7394
# Min.=1, max.=3.

# Transform Hoover A. & Krishnamurti S. (2010) [2]:
# 2. turndown_volume_freq:
a <- 391*c(0.013,0.358,0.629)
b <- rep(seq_along(a), round(a))
mean(b)  # 2.616
sd(b)    # 0.5125
# Min.=1, max.=3.

# Transform Hoover A. & Krishnamurti S. (2010) [3]:
# 3. buy_specialearphones_freq:
a <- 391*c(0.09,0.414,0.496)
b <- rep(seq_along(a), round(a))
mean(b)  # 2.407
sd(b)    # 0.6491
# Min.=1, max.=3.


#12. locus_of_control_7ddfj4wn
# All univariate frequency results coded as locus of control:
loc <- univDetFreq[
  univDetFreq$varSpec_oper.constructIdentifier_aggr2 == "locus_of_control_7ddfj4wn",
]
View(loc)
# Study/studies: 4
# Type of data: nominal (1 & 3 cat.)
# Nr. of results: 4 (after results for Beach and Kamphuis have been averaged!)
# Steps to take: Do items express L.O.C.? Items appear to measure the same construct, but that may not be L.O.C., or even a psych. construct.
# Decision: Use these outcomes in the discussion, but not presented as a frequency and/or plot?
# Transfer to "Means" df: 0 (all nominal).


#13. intention_73dnt604
# All univariate frequency results coded as intention:
intention <- univDetFreq[
  univDetFreq$varSpec_oper.constructIdentifier_aggr2 == "intention_73dnt604",
]
View(intention)
# Study/studies: 8
# Type of data: nominal & ordinal (cat. 1 -5)
# Nr. of results: 8 (after results for Beach and Griest have been averaged! Both 2 results)
# Steps to take: all items are in the same direction: i.e. agreement means high intention to engage in h.p.b. (which are quite diverse here), so the % for 'yes', 'agree', etc. can be used to compute an overall outcome.
# Decision: The univariate data for this construct can be used (as it is).
# Transfer to "Means" df: 3, Danhauer (2009), You(2020), Martens (2006).

# Transform Danhauer (2009):
# volumelimit_software_freq: Reverse scores and make 1 cat. of 'Maybe' & 'I don't know'.
a <- 551*c(0.165, 0.375, 0.459)
b <- rep(seq_along(a), round(a))
mean(b)  # 2.294
sd(b)    # 0.7341
# Min.=1, max.=3.

# Transform You(2020):
# volumelimit_software_freq: Reverse scores and make 1 cat. of 'Maybe' & 'I don't know'.
a <- 1009*c(0.181, 0.26, 0.558)
b <- rep(seq_along(a), round(a))
mean(b)  # 2.377
sd(b)    # 0.7737
# Min.=1, max.=3.

# Transform Martens (2006):
# intention_hpuse_freq: scores can be used as they are (5 categories).
a <- 453*c(0.411,0.397,0.172,0.015,0.004)
b <- rep(seq_along(a), round(a))
mean(b)  # 1.806
sd(b)    # 0.805
# Min.=1, max.=5.


#14. descriptiveNorms_73dnt5zp
# All univariate frequency results coded as descriptive norms:
descriptiveNorms <- univDetFreq[
  univDetFreq$varSpec_oper.constructIdentifier_aggr2 == "descriptiveNorms_73dnt5zp",
]
View(descriptiveNorms)
# Study/studies: 6
# Type of data: mostly dichotomous, but also ordinal (5 cat.)
# Nr. of results: 9. The 'reference' may be either 'friends' or '(pop) musicians'; these results should be aggregated separately! So here it is justified that studies report more than >1 result.
# Steps to take: create two subgroups and join responses showing that descriptive norms stimulate h.p.b.
# Decision: Results are split over 2 groups ('friends' and 'musicians'), so results per group are very few. It is not possible to combine these groups. The question then is, like with previous DCTs: is it meaningful to report a summary of fewer than 5 results? What is the minimum number of results needed for such a summary?
# Transfer to "Means" df: 1, Pouwels, L. (2017).

# Transform Pouwels, L. (2017):
# hp_if_friends_wear_freq: high score = high influence of descriptive norms on earplug use.
a <- 675*c(0.559, 0.283, 0.075, 0.075, 0.009)
b <- rep(seq_along(a), round(a))
mean(b)  # 1.695
sd(b)    # 0.9609
# Min.=1, max.=5.


#15. attitude_soundculture_7c082lmy
# All univariate frequency results coded as attitude towards the sound culture:
attitude_soundculture <- univDetFreq[
  univDetFreq$varSpec_oper.constructIdentifier_aggr2 == "attitude_soundculture_7c082lmy",
]
View(attitude_soundculture)
# Study/studies: 5
# Type of data: nominal & ordinal (3 & 4 cat.)
# Nr. of results: 8. Here again, it seems justified to accept more than 1 result per study, as attitude towards level of sound is related to the kind of event (concert, festival, etc.)
# Steps to take: Create subgroups for different kinds of events and aggregate the % of people in each group that find the sound level too high.
# Decision: By splitting up the results, the groups become very small again. However, aggregating all results is not an option, as it will result in a loss of information.
# Transfer to "Means" df: 5 studies, Beach E.F. & Gilliver M. (2019), Gorter, A.F. (2012), Kamphuis, L. (2018), Martens (2006), Zieltjens (2014). Note: all measures should point in the same direction (not the case right now)! If aggregated, information related to the specific context is lost.

# Transform Beach E.F. & Gilliver M. (2019) [2 subgroups; can later be aggregated]:
# 1. perception_sound_levels_nc (nightclub):
# 'Not as loud as you'd like', 'Just right', 'Loud, but tolerable' & 'Louder than you'd like': reverse the scoring, so a high score = positive attitude towards loud noise (just like the YANS).
a <- 555*c(0.26,0.597,0.128,0.014)
b <- rep(seq_along(a), round(a))
mean(b)  # 1.897
sd(b)    # 0.6603
# Min.=1, max.=4.

# Transform Beach E.F. & Gilliver M. (2019) [2 subgroups; can later be aggregated]:
# 2. perception_sound_levels_lm (live music):
# See previous item (same 4 response categories, scoring reversed).
a <- 378*c(0.172,0.626,0.183,0.019)
# BUG FIX: the original code expanded only lapply(1:3, ...), silently dropping
# the 4th category ('Louder than you'd like', 1.9%) even though 'a' holds 4
# proportions; the previous, identical item correctly uses 4 categories.
b <- rep(seq_along(a), round(a))
mean(b)  # 2.048 (previously reported as 2.011, computed from 3 categories only)
sd(b)    # 0.654
# Min.=1, max.=4.

# Transform Gorter, A.F. (2012):
# attitude_volume_freq:
# Present order: 'sound volume acceptable, no burden'(=1), "sound volume acceptable, but burden" (=2),"sound volume loud, no burden" (=3),"sound volume loud, but burden" (=4).
# New order: "sound volume loud, but burden" (=1), 'sound volume acceptable, but burden'(=2), "sound volume loud, no burden" (=3), 'sound volume acceptable, no burden'(=4). High value = positive attitude towards loud noise.
# Is this acceptable? The scale is not really ordinal... Values for new order:
a <- 130000*c(0.18,0.29,0.35,0.18)
b <- rep(seq_along(a), round(a))
mean(b)  # 2.53
sd(b)    # 0.9844
# Min.=1, max.=4.

# Transform Kamphuis, L. (2018) [3 subgroups; results can later be aggregated]:
# 1. attitude_volume_freq_f (festivals):
# Values are in the correct order (high value = positive attitude towards loud music).
a <- 1086*c(0.33,0.64,0.03)
b <- rep(seq_along(a), round(a))
mean(b)  # 1.701
sd(b)    # 0.5203
# Min.=1, max.=3.

# Transform Kamphuis, L. (2018) [3 subgroups; results can later be aggregated]:
# 2. attitude_volume_freq_c (concerts):
# Values are in the correct order (high value = positive attitude towards loud music).
a <- 1326*c(0.34,0.63,0.02)
b <- rep(seq_along(a), round(a))
mean(b)  # 1.677
sd(b)    # 0.5099
# Min.=1, max.=3.

# Transform Kamphuis, L. (2018) [3 subgroups; results can later be aggregated]:
# 3. attitude_volume_freq_dp (dance party):
# Values are in the correct order (high value = positive attitude towards loud music).
a <- 412*c(0.36,0.55,0.09)
b <- rep(seq_along(a), round(a))
mean(b)  # 1.731
sd(b)    # 0.6143
# Min.=1, max.=3.

# Transform Martens (2006):
# volume_nightlife_lower_freq: values are in the correct order (high value = positive attitude towards loud music).
a <- 503*c(0.05,0.342,0.608)
b <- rep(seq_along(a), round(a))
mean(b)  # 2.559
sd(b)    # 0.5888
# Min.=1, max.=3.

# Transform Zieltjens (2014):
# volume_nightlife_freq: values are in the correct order (high value = positive attitude towards loud music).
# The original scale is a 5-point scale. However, I convert it to a 3-point scale as the distinction between the categories is not clear; with 3 categories no artificial differences are created (e.g. a score of 2 in the old order was not really different from a score 3): 2&3 and 4&5 are joined.
a <- 64*c(0.125,0.8594,0.0156)
b <- rep(seq_along(a), round(a))
mean(b)  # 1.891
sd(b)    # 0.3615
# Min.=1, max.=3.



#16. autonomy_belief_73dnt5zt
# 'autonomy_belief' = perceived barriers to earplug use (see notes below).
autonomy_belief <- univDetFreq[univDetFreq$varSpec_oper.constructIdentifier_aggr2=="autonomy_belief_73dnt5zt", ] 
View(autonomy_belief)
# Relocate column (put the barrier category next to the construct identifier):
autonomy_belief <- relocate(autonomy_belief, varSpec_barrier_category, .before = varSpec_oper.constructIdentifier_aggr2)
# Remove empty rows (rows in which every column is NA):
autonomy_belief <- autonomy_belief[rowSums(is.na(autonomy_belief)) != ncol(autonomy_belief), ]
# Add row numbers:
autonomy_belief$row_number <- seq.int(nrow(autonomy_belief))
autonomy_belief <- relocate(autonomy_belief, row_number, .before = author_year)
# Remove column:
# NOTE(review): the column is dropped by position (10); this is only valid for
# the current column order after the relocations above -- confirm whenever the
# upstream extraction changes.
autonomy_belief <- autonomy_belief[ ,-10]
# Remove duplicate rows:
# NOTE(review): rows are removed by position; these indices depend on the exact
# state of the data frame at this point -- re-check after any upstream change.
autonomy_belief <- autonomy_belief[-c(5,7,19:20,22:23), ]
# Adjust incorrect category labels Pouwels (2017):
# NOTE(review): cell positions [20:22, 9] are also hard-coded.
autonomy_belief[20,9] <- "forget to bring/wear"
autonomy_belief[21,9] <- "other"    
autonomy_belief[22,9] <- "uncomfortable"
View(autonomy_belief)
# Study/studies: 9 
# Type of data: numeric, ordinal, nominal (2-5 cat.)
# Nr. of results: 12
# Steps to take: 'autonomy_belief' stands for 'perceived barriers to use earplugs'. Different barriers are identified. Based on the txs-specification every answer has been placed in a category. Subsequently the average frequency a barrier is named as being a reason not to wear earplugs can be reported. However, the number of results for the barrier categories are very unequal. The 1 result from Chung (2005) likely can not be used, as it does not fit in any of the categories.
# Decision: These results can be used, but probably should be presented separately from the results for the other DCTs (i.e. not in one plot), as the format here is different (not a single average frequency, but a number of (average) frequencies for the 'barrier' categories).
# Transfer to "Means" df: 4 studies could be transferred. However, all items relate to one specific barrier. In the "means df" most scores are an average of the answers to multiple questions measuring different barriers (and, thus, reflecting the overall perceived presence of barriers). Aggregation thus would result in combining single and mean scores. In addition, the information is lost which barriers are perceived to be the most important. Would using the outcomes for an aggregated mean and also treating them categorically (i.e. reporting the frequency each particular barrier is mentioned) be an option? 

# Placeholder template (study name and values still to be filled in):
# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# [COPY]


#17. risk_perception_79n2fh4t
# All univariate frequency results coded as risk perception:
risk_perception <- univDetFreq[
  univDetFreq$varSpec_oper.constructIdentifier_aggr2 == "risk_perception_79n2fh4t",
]
View(risk_perception)
# Study/studies: 18
# Type of data: numeric, ordinal, nominal (2-5 cat.)
# Nr. of results: 22
# Steps to take: Some studies report >1 result (slightly different questions, but same DCT). If no subcategories are constructed, these results have to be averaged so there is 1 result per study.
# Decision: For this, and possibly other DCTs, an option could be to compose one overall frequency/score, plus the average frequency for the main sub-themes/questions (if clear themes can be identified).
# Transfer to "Means" df: 9 studies, Gorter, A.F. (2012)(2 results), Rawool V.W. & Collignon-Wayne L.A. (2008),You S.,Kwak C. & Han W. (2020), Beach E.F. & Gilliver M. (2019), Danhauer (2009), Danhauer (2012)(2 results), You S.,Kwak C. & Han W. (2020), Carter L. & Black D. (2017).
# Lee D. & Han W. (2019) now is dichotomous, but could be reextracted and added to the "means df" (item originally was ordinal, but was dichotomized to facilitate comparison with same item in MTV-survey [which was dichotomous]).

# Placeholder templates (study names and values still to be filled in):

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.


#18. knowledge_79n2fh4b
# All univariate frequency results coded as knowledge:
knowledge <- univDetFreq[
  univDetFreq$varSpec_oper.constructIdentifier_aggr2 == "knowledge_79n2fh4b",
]
View(knowledge)
# Study/studies: 19
# Type of data: numeric, ordinal, nominal (2-5 cat.)
# Nr. of results: Some studies report >1 result.
# Steps to take: as said before, an option could be to compose one overall frequency/score, plus the average frequency for two or more main themes, e.g. a knowledge theme that is often reported, is whether participants know what produces the ringing sound in the ears after exposure to loud sounds.
# Decision: Not the knowledge level in itself, but whether knowledge can be a factor that influences h.p.b. is of importance here. In that light the questions should be evaluated.
# Transfer to "Means" df: 5 studies, Danhauer (2009)(2 results), Danhauer (2012)(2 results), Hoppenbrouwer (2018), You (2020), Gorter (2012).

# Placeholder templates (study names and values still to be filled in):

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.



#19. threat_susceptibility_79n2fh4s
# All univariate frequency results coded as threat susceptibility:
threat_susceptibility <- univDetFreq[
  univDetFreq$varSpec_oper.constructIdentifier_aggr2 == "threat_susceptibility_79n2fh4s",
]
View(threat_susceptibility)
# Study/studies: 19
# Type of data: numeric, ordinal, nominal (2-5 cat.)
# Nr. of results: Some studies report >1 result. For these studies weighted averages have to be computed.
# Steps to take: The questions are fairly homogeneous, so data can quite easily be aggregated.
# Decision: Include.
# Transfer to "Means" df: 10 studies, Callahan (2012), Danhauer (2012), DelGiacco (2015), Gorter, A.F. (2012)(2 results), Herrera (2016), Rawool (2008) (2 results), Steen, L. (2008), Bogosch I.I., House R.A. & Kudla I. (2005), Hoover A. & Krishnamurti S. (2010), Johnson (2014). Note: the 'NA' in the latter should be value '3'.

# Placeholder templates (study names and values still to be filled in):

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.

# Transform XXXX (XX):
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234)
# a
b <- rep(seq_along(a), round(a))
# b
mean(b)
sd(b)
# Min.=X, max.=X.



##############################################################################
########## Calculate M+SD from data (if not reported) ########################

# Worked example: Callahan et al. (2012), "worried_hearingloss_freq", n=123.
View(univDetFreq)
a <- 123*c(0.184,0.392,0.264,0.072,0.084) # % of participants per answer category (5 categories).
a # The n for each of the 5 groups.
b <- rep(seq_along(a), round(a)) # One score per respondent: category i repeated round(a[i]) times.
b # Vector of the 123 values.
mean(b) # Mean and SD can now be calculated.
sd(b)


# Calculate M+SD for Callahan et al. (2012), variable 'exposure_damage_freq', n=125, frequencies reported for the 5 answer options (5-point Likert scale):
View(threat_susceptibility)
a <- 125*c(0.008, 0.121, 0.226, 0.411, 0.234) # Answer frequencies for each of the 5 categories.
a # The n for each of the 5 groups.
# sum(c(1.000, 15.125, 28.250, 51.375, 29.250)) # 125 (check if n's sum up to N)
b <- rep(seq_along(a), round(a))
b # Vector of the 125 values.
mean(b) # 3.741935
sd(b)   # 0.9783923
# The latter 2 values can now be used in the "means df".
# To calculate rescaled M+SD, the min. & max. possible scores are needed. In this case (as a 5-point scale is used), the min.=1, and the max.=5.






############################################################################
################ Aggregation of MEANS for equal DCTs #######################

# Can results of constructs with equal DCTs in 'univDetMean' be aggregated?

# 1. Some means are frequencies. These results sometimes can be aggregated with the rescaled means (these are on the same scale, i.e. from 0 - 100). If not, they can be aggregated with 'frequencies' in the other df.
# 2. For 'perceivedNorms_73dnt5zq' the results for Gilliver (2012) can NOT be aggregated with the other studies (too heterogeneous).
# 3. For 'risk_perception_79n2fh4t' the results for Gilliver (2012) of variable 'risk_volume' should be REVERSED.
# 4. For 'attitude_soundculture_7c082lmy' in Zocoli (2009) two versions have been extracted (i.e. a 7- and 8-item version of YANS Factor 1), use ONLY the 8-item result. In Chesky (2009) use ONLY the result of the total sample, not of the subgroups.
# 5. Take into consideration the hearing protective behavior and setting that has been studied, e.g. 'risk perception' with regard to a pop concert and pld-use is not the same. This means that for some DCTs SEPARATE RESULTS have to be aggregated, depending on the behavior that has been studied. For some determinants, the behavior does not matter, e.g. 'risk severity' (i.e. how severe one perceives hearing problems to be) is independent from the behavior.

# Taking these five points into consideration, the results of constructs with the same DCT can be aggregated.



#############################################################################
################## Standardized scores of DCTs ##############################

### Use the standardized scores of variables with equal DCTs and which refer to comparable behavior and/or settings in 'univDetMean' to build plots:

# Relocate the column containing the DCTs:
univDetMean <- relocate(univDetMean, varSpec_oper.constructIdentifier_aggr2, .before = uni.subsample)
# Relocate the column containing info whether the DCT is linked to PLD-use: 
univDetMean <- relocate(univDetMean, studyInfo_pld_use, .before = uni.subsample)
# For convenience, relocate 'uni_mean' and 'uni.mean_rescaled':
univDetMean <- relocate(univDetMean, uni.mean_rescaled, .before = uni.subsample)
univDetMean <- relocate(univDetMean, uni.mean, .before = uni.subsample)
View(univDetMean)

# Drop the DCTs which (after aggregation) have less than 4 measurements (see 'univDetMeanId'):
# The "# -k" trailing comments record how many rows each filter removes.
univDetMean <- univDetMean[univDetMean$varSpec_oper.constructIdentifier_aggr2!="prototype_79n2fh4t", ] # -1
univDetMean <- univDetMean[univDetMean$varSpec_oper.constructIdentifier_aggr2!="habit_79n2r0sz", ] # -2
univDetMean <- univDetMean[univDetMean$varSpec_oper.constructIdentifier_aggr2!="knowledge_79n2fh4b", ] # -2
univDetMean <- univDetMean[univDetMean$varSpec_oper.constructIdentifier_aggr2!="motivation_79n2fh4q", ] # -2
# Drop variables without DCT:
# NOTE(review): column 5 is addressed by position; it is only the intended
# column after the exact relocation sequence above -- confirm if that changes.
univDetMean <- univDetMean[complete.cases(univDetMean[ ,5]), ] # -4
View(univDetMean)
# These steps have reduced the number of rows (i.e. results) from 168 to 156.



##############################################################################
################### Create separate groups for PLD & Noise ###################

### Make two separate groups of univDetMean-studies that either measure (only) hearing protective behavior (h.p.b.) when using a PLD or when exposed to loud noise (in leisure and/or work settings):

# See 'txs-specification--dct-pld' for correct classification. This spreadsheet has not been imported into the script (yet).

### 1) Group PLD-USE (h.p.b. mainly consists of volume restriction):
# IMPROVED: %in% replaces the long chain of '==' comparisons joined by '|'.
# It is clearer, and it yields FALSE (instead of NA) for a missing studyId,
# so no all-NA rows can slip into the subset.
pld_study_ids <- c(
  "DeBruijn_Spaans_Jansen_VantRiet_2016.rxs.Rmd",
  "Diviani-Zanini-Amann-Chadha-Cieza-Rubinelli--2019.rxs.Rmd",
  "Gilliver-Carter-Macoun-Rosen-Williams--2012.rxs.Rmd",
  "Lee_Jeong_2021.rxs.Rmd",
  "Portnuff_Fligor_Arehart_2011.rxs.Rmd",
  "Portnuff--2011.rxs.Rmd",
  "Vogel-Brug-VanderPloeg-Hein Raat--2011.rxs.Rmd"
)
rows_about_pld <- univDetMean$studyId %in% pld_study_ids
univDetMeanPld <- univDetMean[rows_about_pld, ]
# Add the correct row numbers:
univDetMeanPld$row_number <- seq.int(nrow(univDetMeanPld))
univDetMeanPld <- relocate(univDetMeanPld, row_number, .before = author_year)
# Exclude the results that are not specifically linked to pld-use (these results are transferred to 'univDetMeanNoise'):
# NOTE(review): rows are dropped by position; re-check after any upstream change.
univDetMeanPld <- univDetMeanPld[-c(2,25,27), ]
View(univDetMeanPld)


### 2) Group LOUD ENVIRONMENTAL NOISE (h.p.b. here mainly consists of wearing earplugs or preventing exposure, both in leisure settings [e.g. concerts] as during work): 
# Exclude results that are explicitly linked to pld-use: 
# NOTE(review): rows are dropped by position; these indices are only valid for
# the current row order of 'univDetMean' -- re-check after any upstream change.
univDetMeanNoise <- univDetMean[-c(5,27:31,52,80:84,88:97,116,118,120:122), ]
# Add the correct row numbers:
univDetMeanNoise$row_number <- seq.int(nrow(univDetMeanNoise))
univDetMeanNoise <- relocate(univDetMeanNoise, row_number, .before = author_year)
View(univDetMeanNoise)


### Create COUNT overview of psych. determinants for the 2 subgroups: 

# Count the number of results per DCT in each subgroup.
# NOTE(review): grouping on the extracted vector (df$col inside group_by())
# works but produces an awkward column name in the result; grouping on the
# bare column name (or using dplyr::count()) would be cleaner, though it
# changes the output column name -- left as-is to keep the output identical.
# Noise:
univDetMeanNoiseId <- univDetMeanNoise %>% group_by(univDetMeanNoise$varSpec_oper.constructIdentifier_aggr2)%>% summarise(n = n())
View(univDetMeanNoiseId)
# PLD:
univDetMeanPldId <- univDetMeanPld %>% group_by(univDetMeanPld$varSpec_oper.constructIdentifier_aggr2)%>% summarise(n = n())
View(univDetMeanPldId)

# Applying the criterion of a minimum of 4 measurements:

# univDetMeanNoise: 9 DCTs remain (range 9-24 measurements/DCT); perceivedBehavioralControl_73dnt603 and risk_perception_79n2fh4t are dropped.
univDetMeanNoise <- univDetMeanNoise[univDetMeanNoise$varSpec_oper.constructIdentifier_aggr2 != "perceivedBehavioralControl_73dnt603", ] # -2
univDetMeanNoise <- univDetMeanNoise[univDetMeanNoise$varSpec_oper.constructIdentifier_aggr2 != "risk_perception_79n2fh4t", ] # -3
View(univDetMeanNoise)

# univDetMeanPld: 5 DCTs remain (range 4-6 measurements/DCT); risk_perception_79n2fh4t and intention_73dnt604 are dropped.
univDetMeanPld <- univDetMeanPld[univDetMeanPld$varSpec_oper.constructIdentifier_aggr2 != "intention_73dnt604", ] # -2
univDetMeanPld <- univDetMeanPld[univDetMeanPld$varSpec_oper.constructIdentifier_aggr2 != "risk_perception_79n2fh4t", ] # -1
View(univDetMeanPld)


##############################################################################
################### Plots for MEANS Psych. Det. ##############################

# Make (separate) plots for 'univDetMean', 'univDetMeanNoise', and 'univDetMeanPld': 


### BOX PLOT with the (final) 'univDetMean' dataset: 

#Load fonts (https://fonts.google.com/) and colors:
library(showtext)
# Register the Google font; showtext_auto() must follow so subsequent plots
# actually render with it.
font_add_google("Montserrat", "montserrat") 
showtext_auto()
library(dutchmasters)
library(RColorBrewer)
library(paletteer) 
library(ggsci)          # The palet 'default_igv' has > 12 colors.
# Printing this object lists all discrete paletteer palettes (interactive aid):
palettes_d              # For more palettes
# Opens a graphics device showing every RColorBrewer palette:
display.brewer.all()

# First plot: box plot of the rescaled mean per DCT, overlaid with the
# individual study values (black dots) and the per-DCT mean (red point).
# Fix: stat_summary()'s 'fun.y' argument was deprecated in ggplot2 3.3.0;
# 'fun' is the current name. Also reflowed the single-line call for
# readability (no other changes).
univDetPlot <- ggplot(univDetMean,
                      aes(x = varSpec_oper.constructIdentifier_aggr2,
                          y = uni.mean_rescaled,
                          fill = varSpec_oper.constructIdentifier_aggr2)) +
  geom_boxplot() +
  geom_dotplot(binaxis = 'y', stackdir = 'center', dotsize = 0.4, fill = "black") +
  stat_summary(fun = mean, geom = "point", shape = 16, size = 4, colour = "red") +
  labs(title = "Standardized score by DCT",
       x = "DCT",
       y = "Standardized score (0-100)") +
  theme(plot.title = element_text(hjust = 0, vjust = 3, size = 12, face = "plain")) +
  theme(text = element_text(family = "roboto")) +
  scale_fill_paletteer_d("ggsci::default_igv", direction = -1) +
  guides(fill = "none") +
  theme_classic() +
  coord_flip()

univDetPlot

# Improved version of same plot: suppresses the boxplot outliers (the raw
# values are shown as dots anyway) and labels each construct with its n.
# Fix: replaced the reassignable shorthand 'F' by 'FALSE' in geom_boxplot().
univDetPlot2 <- ggplot(univDetMean, aes(x=varSpec_oper.constructIdentifier_aggr2, y=uni.mean_rescaled, fill=varSpec_oper.constructIdentifier_aggr2)) + geom_boxplot(varwidth = FALSE, outlier.shape = NA) + geom_dotplot(binaxis='y', stackdir='center', dotsize=0.2, stackratio = 1.5, fill="black") + stat_summary(fun=mean, geom="point", shape=16, size=3, colour="red") + labs(title="Standardized score by psychological construct", x= "Construct name", y = "Standardized score (0-100)") + scale_x_discrete(labels=c("threat_susceptibility_79n2fh4s" = "Threat susceptibility (n=18)", "threat_severity_79n2fh4r" = "Threat severity (n=13)","risk_perception_79n2fh4t" = "Risk perception (n=4)", "perceivedNorms_73dnt5zq" = "Perceived norms (n=13)","perceivedBehavioralControl_73dnt603" = "Perceived Behavioral Control (n=7)","intention_73dnt604" = "Intention (n=12)","instrumentalAttitude_73dnt5zb" = "Instrumental attitude (n=20)","capacity_73dnt602" = "Capacity (n=9)","autonomy_belief_73dnt5zt" = "Autonomy belief (n=18)","attitude_noise_7c08258d" = "Attitude towards noise in general (n=20)","attitude_soundculture_7c082lmy" = "Attitude towards noise in youth culture (n=24)")) + scale_fill_dutchmasters(palette = "milkmaid") + guides(fill = "none") + theme_bw() + coord_flip()

# Apply title placement and the Montserrat font registered above:
univDetPlot2 <- univDetPlot2 + theme(plot.title = element_text(hjust = 0.4, vjust = 2, size = 12, face = "plain"))+ theme(text = element_text(family = "montserrat"))
univDetPlot2
# Code to possibly add for scaled points (note that results are not weighted
# for sample size and only N of total sample has been used!).
# Fix: this fragment was left as bare code; a free-standing '+ ...' expression
# with a trailing '+' is a parse/evaluation hazard when the script is sourced
# (it silently consumes the next expression or errors). Kept as a comment
# until it is actually appended to a plot object:
# + geom_point(aes(size = studyMethods_N)) +
#   scale_size_area("Sample size", breaks = c(100, 250, 500, 1000, 2000))

  
### BOX PLOT for 'univDetMeanNoise' (i.e. studies about h.p.b. and environmental noise):
# Same layout as univDetPlot2, restricted to the noise subgroup; the (n=...)
# labels are hard-coded and must be kept in sync with the count overviews above.
univDetPlotNoise <- ggplot(univDetMeanNoise, aes(x=varSpec_oper.constructIdentifier_aggr2, y=uni.mean_rescaled, fill=varSpec_oper.constructIdentifier_aggr2)) + geom_boxplot (color="black") + geom_dotplot(binaxis='y', stackdir='center', dotsize=0.2, fill="black") + stat_summary(fun=mean, geom="point", shape=16, size=3, colour="red") + labs(title="Standardized score by psychological construct for loud noise settings", x= "Construct name", y = "Standardized score (0-100)") + scale_x_discrete(labels=c("threat_susceptibility_79n2fh4s" = "Threat susceptibility (n=13)", "threat_severity_79n2fh4r" = "Threat severity (n=9)", "perceivedNorms_73dnt5zq" = "Perceived norms (n=11)", "intention_73dnt604" = "Intention (n=10)","instrumentalAttitude_73dnt5zb" = "Instrumental attitude (n=15)","capacity_73dnt602" = "Capacity (n=9)","autonomy_belief_73dnt5zt" = "Autonomy belief (n=13)","attitude_noise_7c08258d" = "Attitude towards noise in general (n=20)","attitude_soundculture_7c082lmy" = "Attitude towards noise in youth culture (n=24)")) + scale_fill_dutchmasters(palette = "milkmaid") + guides(fill = "none") + theme_bw() + coord_flip()

# Title placement and Montserrat font (registered earlier with showtext):
univDetPlotNoise <- univDetPlotNoise + theme(plot.title = element_text(hjust = 0.4, vjust = 2, size = 12, face = "plain"))+ theme(text = element_text(family = "montserrat"))
univDetPlotNoise



### BOX PLOT for 'univDetMeanPld' (i.e. studies about h.p.b. and pld-use):
# Same layout as the noise plot, restricted to the PLD subgroup; the (n=...)
# labels are hard-coded and must match the PLD count overview above.
univDetPlotPld <- ggplot(univDetMeanPld, aes(x=varSpec_oper.constructIdentifier_aggr2, y=uni.mean_rescaled, fill=varSpec_oper.constructIdentifier_aggr2)) + geom_boxplot (color="black") + geom_dotplot(binaxis='y', stackdir='center', dotsize=0.2, fill="black") + stat_summary(fun=mean, geom="point", shape=16, size=3, colour="red") + labs(title="Standardized score by psychological construct for PLD use", x= "Construct name", y = "Standardized score (0-100)") + scale_x_discrete(labels=c("threat_susceptibility_79n2fh4s" = "Threat susceptibility (n=5)", "threat_severity_79n2fh4r" = "Threat severity (n=4)", "perceivedBehavioralControl_73dnt603" = "Perceived Behavioral Control (n=5)", "instrumentalAttitude_73dnt5zb" = "Instrumental attitude (n=5)", "autonomy_belief_73dnt5zt" = "Autonomy belief (n=5)")) + scale_fill_dutchmasters(palette = "milkmaid") + guides(fill = "none") + theme_bw() + coord_flip()

# Title placement and Montserrat font (registered earlier with showtext):
univDetPlotPld <- univDetPlotPld + theme(plot.title = element_text(hjust = 0.4, vjust = 2, size = 12, face = "plain"))+ theme(text = element_text(family = "montserrat"))
univDetPlotPld



############################################################################

### Notes: 

# Reverse YANS score? Now, for all DCTs a high score (potentially) stimulates h.p.b. (e.g. high score on instrumental attitude, or threat severity). However, for the YANS it is the opposite (high score = pro-noise). For people who know the YANS, reversing the scores could however cause confusion.
# Add N for sum of total number of participants for each construct (i.e. sum of participants per study), or mention N in text?

# In the previous box plots NO WEIGHTS have been assigned to studies (!): i.e. studies with small and large samples contribute equally, and the variance within studies is not taken into account.

# The subgroups 'Noise' and 'PLD' can also be JOINED (to prevent small groups). In that case, the responses on the items/questions are used to refer to GENERAL hearing protective behaviours and/or general risks of loud noise (i.e. not specifically to PLD-use or noisy settings such as concerts)? If this is the approach, the "univDetMean" data frame is used for the analyses (and "univDetMeanPld" and "univDetMeanNoise" are ignored). 

# If results from the subgroups 'Noise' and 'PLD' are JOINED, for studies with multiple homogeneous results (e.g. Vogel et al. have measured the same constructs with regard to noise in discotheques and pld-use, using the same sample, for different publications), AVERAGE scores have to be calculated to prevent inclusion of correlated results! 

# Outliers have been checked and there is no reason to correct the data or exclude studies.

# What does each score tell us, e.g. score of 50 on 'autonomy belief' (a.k.a. 'barrier')?

#############################################################################


### YANS

##############################################################################
######### Compare scores of studies on the 1st Factor of the YANS ############
names(univarDf)

# Which variables feed into the 'attitude towards sound culture' construct?
table(univarDf$uni.variable[univarDf$varSpec_oper.constructIdentifier == "attitude_soundculture_7c082lmy"])

# Logical mask: rows for this construct measured with one of the three
# comparable YANS-F1 variable codings.
rows_about_att_soundculture <-
        (univarDf$varSpec_oper.constructIdentifier == "attitude_soundculture_7c082lmy") &
        (univarDf$uni.variable == "attitude_noise"|univarDf$uni.variable == "attitude_soundculture"|univarDf$uni.variable == "attitude_soundculture_8items")

att_soundcultureDf <- univarDf[rows_about_att_soundculture, ]
View(att_soundcultureDf)
## Dropping studies (with incomparable data) from data frame:
att_soundcultureDf <- att_soundcultureDf[att_soundcultureDf$studyId!="OlsenWiden-Erlandsson--2004.rxs.Rmd", ]
att_soundcultureDf <- att_soundcultureDf[att_soundcultureDf$studyId!="Widén-Bohlin-Johansson--2011.rxs.Rmd", ]
att_soundcultureDf <- att_soundcultureDf[att_soundcultureDf$studyId!="Widén--2013.rxs.Rmd", ]
# Add row numbers:
att_soundcultureDf$row_number <- seq.int(nrow(att_soundcultureDf))
att_soundcultureDf <- relocate(att_soundcultureDf, row_number, .before = author_year)
View(att_soundcultureDf)
# Remove duplicate rows Widen et al. (2006), subgroups Chesky (2009), and 7-item score Zocoli,2009:
# NOTE(review): positions are hard-coded against the row numbering created
# just above; re-check after any upstream change.
att_soundcultureDf <- att_soundcultureDf[-c(2,3,19,21,24), ]
# Widen (2006) is present twice, label the subgroups to distinguish them in plots:
# NOTE(review): direct cell assignment assumes column 2 is author_year and that
# rows 17/18 are the two Widen (2006) subgroups — verify after upstream edits.
att_soundcultureDf[17,2] <- "Widen S.E., Holmes A.E. & Erlandsson S.I. (2006) a"
att_soundcultureDf[18,2] <- "Widen S.E., Holmes A.E. & Erlandsson S.I. (2006) b"
# a=USA, b=Sweden.


#############################################################################
######### Weighted scores (inverse variance method) for YANS F1  ############
library(metafor)
# Make data frame smaller (for convenience):
# NOTE(review): column positions are hard-coded; re-check after any change to
# the column layout of 'att_soundcultureDf'.
att_soundcultureDf2 <- att_soundcultureDf[ ,-c(5,7:9,14,17:25,27:71,74:83)]
# Add row numbers:
att_soundcultureDf2$row_number <- seq.int(nrow(att_soundcultureDf2))
att_soundcultureDf2 <- relocate(att_soundcultureDf2, row_number, .before = author_year)
View(att_soundcultureDf2)
# escalc function (not necessary here):
att_sc <- escalc(measure="MN", mi=uni.mean, sdi=uni.sd, ni=N, data=att_soundcultureDf2)
att_sc
# What does 'escalc' do with measure="MN"?
getAnywhere(escalc)
# Fix: the snippet below is pasted from the body of metafor::escalc purely for
# reference; executing it at the top level fails ('measure', 'mi', 'sdi', 'ni'
# are not defined in this scope), so it is kept as a comment:
# if (measure == "MN") {
#     yi <- mi
#     vi <- sdi^2/ni
# }
# Studies with NAs are omitted from model. For plot, exclude 'NA' for now:
att_soundcultureDf2 <- att_soundcultureDf2[-c(7), ] 
View(att_soundcultureDf2)

# Random effects model (REML, weighted, i.e. the default settings):
att_sc_fit_RE <- rma(measure="MN", mi=uni.mean, sdi=uni.sd, ni=N, data=att_soundcultureDf2)  # For plot, add "slab=(author_year)".
# Fix: call the S3 generic summary() instead of the method summary.rma()
# directly; dispatch reaches the same method, and the generic is the
# documented interface.
summary(att_sc_fit_RE) # Weighted average = 2.9566 (95% CI 2.7075 - 3.2057)

weights(att_sc_fit_RE) # Equal weights for all studies (about 5% contribution each to overall average).
# In the fixed effects model, the differences in weights are very large (2 studies make up > 95% of weighted average!).

mean(att_soundcultureDf2$uni.mean) # Average = 2.957.
# In the R.E. model, the estimate thus is EQUAL to this calculated mean! Therefore, there is NO added benefit in using the R.E. model. 

options(scipen = 999) # off / options(scipen = 0) # on

# Basic forest plot R.E. Model:
# Fix: spelled out TRUE instead of the reassignable shorthand T.
forest(att_sc_fit_RE, showweights=TRUE, header=TRUE, xlim=c(1.5,4.5), at=seq(1.75,4.25,by=0.25), refline=2.957, xlab="Mean Score on YANS Factor 1")
# Shrink/bolden the annotation font, add a "Weight" column header, then
# restore the previous par() settings:
op <- par(cex=.59, font=2)
text(c(4.0), 20.0, c("Weight"))

par(op)

# The current author names are too long for the plot, so I did not use them for now.

##############################################################################
####### Plots YANS F1 (unweighted) with data frame 'att_soundcultureDf' ######

# YANS F1 plot: one horizontal bar per study showing the mean score, with
# error bars spanning +/- 1 SD around it.
YF1 <- ggplot(att_soundcultureDf,
              aes(x = uni.mean, y = author_year, fill = author_year)) +
  geom_bar(stat = "identity", color = "blue") +
  geom_errorbar(aes(xmin = uni.mean - uni.sd, xmax = uni.mean + uni.sd),
                width = .4, position = position_dodge(.9)) +
  labs(title = "Attitude towards noise, YANS Factor 1",
       x = "Mean score and standard deviation",
       y = element_blank()) +
  theme(plot.title = element_text(hjust = 0, vjust = 3, size = 12, face = "plain")) +
  theme(axis.title.x = element_text(hjust = 0.1, size = 10)) +
  theme(text = element_text(family = "roboto")) +
  scale_fill_paletteer_d("ggsci::default_igv", direction = -1) +
  guides(fill = "none")

YF1
# Save last plot in wd:
ggsave("plot.png", width = 15, height = 15) 
# Changes the font of all text lines in the figure:
YF1 + theme(text = element_text(family = "dosis"))
# Unload showtext:
detach("package:showtext", unload = TRUE)
# Remove all plots:
dev.off()
### Possible adjustments of plot:
# Y-axis from A-Z (not Z-A).
# add '+ theme_minimal()' to script.
# use better looking pallet.


###########################################################################
##################### Compare YANS Overall scores #########################

View(univarDf) # DF with author_year as first column (see previous part)

# Which variables feed into the overall 'attitude towards noise' construct?
table(univarDf$uni.variable[univarDf$varSpec_oper.constructIdentifier == "attitude_noise_7c08258d"])

rows_about_att_noise <-
        (univarDf$varSpec_oper.constructIdentifier == "attitude_noise_7c08258d")

att_noiseDf <- univarDf[rows_about_att_noise, ]
# View(att_noiseDf)

## Dropping studies (with incomparable data) from data frame:
att_noiseDf <- att_noiseDf[att_noiseDf$studyId!="OlsenWiden-Erlandsson--2004.rxs.Rmd", ]
att_noiseDf <- att_noiseDf[att_noiseDf$studyId!="Widén-Bohlin-Johansson--2011.rxs.Rmd", ]
att_noiseDf <- att_noiseDf[att_noiseDf$studyId!="Widén--2013.rxs.Rmd", ]
View(att_noiseDf)

## Using only one score from three studies with multiple scores:
# NOTE(review): positions are hard-coded; re-check after upstream changes.
# att_noiseDf <- att_noiseDf[-8, ]      #Drop duplicate row Hickson,2007
att_noiseDf <- att_noiseDf[-c(17,19), ] #Widen (2006) drop 2 duplicate rows
View(att_noiseDf)

# Widen (2006) is present twice, label the subgroups 'a' and 'b'.
# NOTE(review): edit() opens an interactive data editor, so this step is
# manual and NOT reproducible when the script is sourced non-interactively;
# consider replacing it with explicit assignments as done for
# 'att_soundcultureDf' above.
att_noiseDf <- edit(att_noiseDf)
# a=USA, b=Sweden

## Now 'att_noiseDf' is ready for plotting:

# YANS OVERALL plot: one horizontal bar per study (mean), with +/- 1 SD
# error bars.
yans_overall <- ggplot(att_noiseDf, aes(x=uni.mean, y=author_year, fill=author_year)) + geom_bar(stat="identity", color="blue") +
geom_errorbar(aes(xmin=uni.mean-uni.sd, xmax=uni.mean+uni.sd),    width=.4,position=position_dodge(.9))+ labs(title="Attitude towards noise, YANS Overall Score (19 items)", x= "Mean score and standard deviation", y = element_blank()) +
theme(plot.title = element_text(hjust = 0, vjust = 3, size = 12, face = "plain")) + theme(axis.title.x = element_text(hjust = 0.1, size=10)) + theme(text = element_text(family = "roboto")) + scale_fill_paletteer_d("ggsci::default_igv", direction = 1) + guides(fill = "none")

yans_overall
##############################################################################
## N.B. After 'sample_categories', and possible other 'txs' variables, have been CORRECTED and vector is no longer split (i.e. redundant rows/observations created in data frame), the script has to be adjusted. If not, rows are removed that should not be removed.
##############################################################################


### BAHPHL


##############################################################################
############### Compare scores on the 7 BAHPHL Subscales #####################

View(univarDf) # DF with author_year as first column (see previous code).

# Select the 8 studies that have used the (unmodified) BAHPHL.
# Fix: replaced the long single-line chain of '==' comparisons joined with '|'
# by a named character vector and '%in%', which is easier to maintain and
# (unlike '==') never yields NA rows if 'studyId' contained missing values.
bahphlStudyIds <- c(
  "Degeest-Keppler-Corthals-Clays--2017.rxs.Rmd",
  "Degeest-Maes-Leyssens-Keppler--2018.rxs.Rmd",
  "Gilles-VandeHeyning--2014.rxs.Rmd",
  "Gilles-VanHal-DeRidder-Wouters-VandeHeyning--2013.rxs.Rmd",
  "Keppler-Dhooge-Degeest-Vinck--2015.rxs.Rmd",
  "Keppler-Dhooge-Vinck--2015.rxs.Rmd",
  "Reddy-Nosa-Mafi-Welch--2021.rxs.Rmd",
  "Udoh-Adeyemo--2019.rxs.Rmd"
)
uniBAHPHL <- univarDf[univarDf$studyId %in% bahphlStudyIds, ]
View(uniBAHPHL)

# From the 8 studies, select data from the 7 BAHPHL subscales (this step can be skipped):
bahphlSubscaleIds <- c(
  "intention_73dnt604",
  "threat_susceptibility_79n2fh4s",
  "threat_severity_79n2fh4r",
  "instrumentalAttitude_73dnt5zb",
  "autonomy_conditionPresence_73dnt5zr",
  "perceivedNorms_73dnt5zq",
  "capacity_73dnt602"
)
uBah <- uniBAHPHL[uniBAHPHL$varSpec_oper.constructIdentifier %in% bahphlSubscaleIds, ]
View(uBah)

#### Compare the M+SD on the 7 BAHPHL SUBSCALES with a PLOT:

### 1)INTENTION:
uBah_intention <- uniBAHPHL[uniBAHPHL$varSpec_oper.constructIdentifier=="intention_73dnt604", ]
View(uBah_intention)

# BAHPHL INTENTION plot (RColorBrewer palette): one horizontal bar per study
# (mean score), with +/- 1 SD error bars.
intentionPlot <- ggplot(uBah_intention,
                        aes(x = uni.mean, y = author_year, fill = author_year)) +
  geom_bar(stat = "identity", color = "blue") +
  geom_errorbar(aes(xmin = uni.mean - uni.sd, xmax = uni.mean + uni.sd),
                width = .4, position = position_dodge(.9)) +
  labs(title = "Intention BAHPHL subscale",
       x = "Mean score and standard deviation",
       y = element_blank()) +
  theme(plot.title = element_text(hjust = 0, vjust = 3, size = 12, face = "plain")) +
  theme(axis.title.x = element_text(hjust = 0.1, size = 10)) +
  theme(text = element_text(family = "roboto")) +
  scale_fill_brewer(palette="Set2") +
  guides(fill = "none")

intentionPlot
# Same plot, with a Paletteer palette (overwrites the previous object):
intentionPlot <- ggplot(uBah_intention,
                        aes(x = uni.mean, y = author_year, fill = author_year)) +
  geom_bar(stat = "identity", color = "blue") +
  geom_errorbar(aes(xmin = uni.mean - uni.sd, xmax = uni.mean + uni.sd),
                width = .4, position = position_dodge(.9)) +
  labs(title = "Intention BAHPHL subscale",
       x = "Mean score and standard deviation",
       y = element_blank()) +
  theme(plot.title = element_text(hjust = 0, vjust = 3, size = 12, face = "plain")) +
  theme(axis.title.x = element_text(hjust = 0.1, size = 10)) +
  theme(text = element_text(family = "roboto")) +
  scale_fill_paletteer_d("ggsci::default_igv", direction = -1) +
  guides(fill = "none")

intentionPlot

## 2) THREAT SUSCEPTIBILITY:

# Subset the BAHPHL studies to the threat-susceptibility subscale rows:
uBah_susceptibility <- uniBAHPHL[uniBAHPHL$varSpec_oper.constructIdentifier=="threat_susceptibility_79n2fh4s", ]
View(uBah_susceptibility)

# BAHPHL threat susceptibility plot: one horizontal bar per study (mean),
# with +/- 1 SD error bars.
susceptibilityPlot <- ggplot(uBah_susceptibility, aes(x=uni.mean, y=author_year, fill=author_year)) + geom_bar(stat="identity", color="blue") +
geom_errorbar(aes(xmin=uni.mean-uni.sd, xmax=uni.mean+uni.sd),    width=.4,position=position_dodge(.9))+ labs(title="Threat susceptibility BAHPHL subscale", x= "Mean score and standard deviation", y = element_blank()) +
theme(plot.title = element_text(hjust = 0, vjust = 3, size = 12, face = "plain")) + theme(axis.title.x = element_text(hjust = 0.1, size=10)) + theme(text = element_text(family = "roboto")) + scale_fill_brewer(palette="Set2") + guides(fill = "none")

susceptibilityPlot

### 3) THREAT SEVERITY:

# Etc.






###############################################################################
############################ Make 'TABLE 1' for review ########################

# In Table 1 the following variables are presented in columns (from left to right): 1.Study (names of authors), 2. Year of publication, 3. Country, 4. Number of participants, 5. Description participants (categories), 6. Age of participants, 7. Gender, 8. Leisure/work/both, 9. Hearing-protective behavior (categories), 10. Psychological determinants assessed, 11. Theoretical framework, 12. Grey literature (yes/no). If the table becomes too large, some items will have to be presented elsewhere.

table1 <- univarDf
View(table1)
# Fix: relocate() previously took 'univarDf' as its input, silently discarding
# the 'table1' copy made above (harmless today because the two are identical
# at this point, but a latent bug); operate on 'table1' instead.
table1 <- relocate(table1, studyMethods_N, .before = N)

# Keep only the age rows for the Table-1 age column:
t1_age <- table1[table1$uni.variable=="age", ]
# Add row numbers:
t1_age$row_number <- seq.int(nrow(t1_age))
t1_age <- relocate(t1_age, row_number, .before = author_year)
# Remove duplicate rows:
# NOTE(review): positions are hard-coded against the row numbering created
# just above; re-check after any upstream change.
t1_age <- t1_age[-c(22,29,31,35,41,66:73,98,101,105:106), ]
View(t1_age) # The df now has 105 rows.

# For some studies no mean age is reported but only min. & max., or the frequencies for age categories are reported (these studies still occupy multiple rows), and for some studies no age is reported at all. 

# Still to finish.......





########################################################################

Extracted constructs

Aspects per UCID

# Pull the single 'methodType' value out of every study's extraction tree:
allMethods <-
  metabefor::get_singleValue(
    studies,
    "methodType"
  );

# Ids of the studies coded as qualitative (presumably 'Id' is the study
# identifier column returned by get_singleValue — verify against metabefor):
qualitativeStudies <-
  allMethods[allMethods$methodType == "qualitative", "Id"];
# For each qualitative study, walk its rxs tree, collect every entity under
# the "variables" node, and stack everything into one data frame with a
# 'studyId' column identifying the source study.
allAspects <-
  metabefor::rbind_df_list(
    lapply(
      qualitativeStudies,
      function(studyId) {
        
        # Children of the "variables" node in this study's extraction tree:
        allVars <-
          data.tree::FindNode(
            studies$rxsTrees[[studyId]],
            "variables"
          )$children
        
        # Extract each child's "value" attribute:
        allVars <-
          data.tree::Get(
            allVars,
            "value"
          );
        
        # Coerce every value to a one-row-per-entry data frame:
        varsList <-
          lapply(
            allVars,
            as.data.frame
          );
        
        varsDf <-
          metabefor::rbind_df_list(varsList);
        
        # Tag every row with the study it came from:
        varsDf$studyId <- rep(studyId, nrow(varsDf));
  
        return(
          varsDf
        );
      }
    )
  );

# Unified construct identifier (ucid): prefer the aspect-level identifier,
# falling back to the operationalization-level one when it is missing.
allAspects$ucid <-
  ifelse(
    is.na(allAspects$aspect.constructIdentifier),
    allAspects$oper.constructIdentifier,
    allAspects$aspect.constructIdentifier
  );

# Aspect text: use the description when present and non-empty, otherwise the
# raw content.
# Fix: '||' is scalar-only (and errors on length > 1 inputs as of R 4.3);
# inside a vectorized ifelse() the elementwise '|' is required.
allAspects$aspectContent <-
  ifelse(
    is.na(allAspects$asp.description) |
      (nchar(trimws(allAspects$asp.description)) == 0),
    allAspects$asp.content,
    allAspects$asp.description
  );

# Render one markdown section per unique construct identifier (UCID), listing
# every extracted aspect as a bullet followed by the study it came from.
for (current_ucid in sort(unique(allAspects$ucid))) {
  
  # Rows belonging to the current UCID:
  ucidRows <- allAspects$ucid == current_ucid;
  
  # Section header:
  cat("\n\n#### ", current_ucid, "\n\n", sep="");
  
  # One bullet per aspect, with the source study shown in gray:
  bulletLines <-
    paste0(
      "\n- ",
      allAspects[ucidRows, "aspectContent"],
      " <span style='color: gray'>`",
      allAspects[ucidRows, "studyId"],
      "`</span>\n"
    );
  
  cat(bulletLines);
}