Attachment 'comments.R'

Download

   1 # Extract comments and target texts from iWork Pages documents
   2 # Prerequisite: exported as iWork Pages '09 file "document.09.pages"
   3 # 
   4 # Ian Riley <ian@riley.asia>
   5 # 10 April 2019
   6 
   7 library("xml2")
   8 
   9 # stackoverflow.com/questions/5060076 answer by Stibu
  10 unescape_html2 <- function(x) {
  11     html <- paste0("<x>", paste0(x, collapse = "<>"), "</x>")
  12     unescaped <- xml2::xml_text(xml2::read_html(html))
  13     strsplit(unescaped, "<>", fixed = TRUE)[[1]]
  14     }
  15     
  16 # get named group matches from regexpr()
  17 get.group <- function(text, m, group = "text") {
  18     start <- attr(m, "capture.start")[, group]
  19     end <- start + attr(m,"capture.length")[, group] - 1
  20     matches <- substr(text, start, end)
  21     matches[nzchar(matches)]
  22 	}
  23 	
  24 # for indented, multiline string assignment
  25 s <- function(s) paste0(trimws(unlist(strsplit(s, "\n"))), collapse = "")
  26 
# File names used by the script:
#   pagesFile - the exported Pages '09 archive that is unzipped
#   payload   - the archive member that is extracted and read line by line
#   delivery  - the CSV file the comment table is written to
pagesFile <- "document.09.pages" 
payload <- "index.xml"
delivery <- "comments.csv"

# All patterns below are PCRE (used with perl = TRUE) and are written as
# indented multi-line strings that s() collapses into a single line.
# The named group "text" (and "tag" where present) is what get.group() reads.

# Value of the sf:target attribute of an <sf:annotation ...> element.
pattern.targets <- s("
    (?:<sf:annotation sf:target=\")
    (?<text>.*?)
    (?:\")
    ")
# Value of the sfa:ID attribute of an <sf:text ...> element.
pattern.ids <- s("
    (?:<sf:text sfa:ID=\")
    (?<text>.*?)
    (?:\")
    ")
# On a line that starts with <sf:text-body ...>: everything between the
# first <sf:p ...> opening tag and the next </sf:p> (matched by lookahead).
pattern.comments <- s("
    (?:^<sf:text-body.*?>)
    (?:<sf:p.*?>)
    (?<text>.*?)
    (?=</sf:p>)
    ")
# A ranged <sf:annotation-field>: "tag" is its sfa:ID (starting with
# "sf:annotation-field-"), "text" is the element content.
pattern.texts <- s("
    (?:<sf:annotation-field sfa:ranged=\"true\" sfa:ID=\")
    (?<tag>sf:annotation-field-.*?)
    (?:\">)
    (?<text>.*?)
    (?:</sf:annotation-field>)
    ")
# An <sf:annotation-field-ref>: "tag" is its sfa:IDREF (same
# "sf:annotation-field-" prefix), "text" is the element content.
pattern.xtexts <- s("
    (?:<sf:annotation-field-ref sfa:IDREF=\")
    (?<tag>sf:annotation-field-.*?)
    (?:\">)
    (?<text>.*?)
    (?:</sf:annotation-field-ref>)
    ")
  61 
  62 # extract the payload
  63 
  64 unzip(pagesFile, payload)
  65 text = readLines(payload)
  66 
  67 # get annotations node
  68 
  69 start <- grep("<sf:annotations>", text, fixed = TRUE)
  70 end   <- grep("</sf:annotations>", text, fixed = TRUE)
  71 
  72 # nothing to get
  73 if (length(start) == 0) stop("No comments found",  call. = FALSE)
  74 
  75 notes <- text[start:end]
  76 
  77 # get comments and their tags
  78 
  79 m <- regexpr(pattern.targets, notes, perl = TRUE)
  80 targets <- get.group(notes, m)
  81 
  82 m <- regexpr(pattern.ids, notes, perl = TRUE)
  83 ids <- get.group(notes, m)
  84 
  85 m <- regexpr(pattern.comments, notes, perl = TRUE)
  86 comments <- get.group(notes, m)
  87 comments <- gsub("<sf:.*?>", "", comments) # clean comments
  88 comments <- unescape_html2(comments)
  89 names(comments) <- targets
  90 
  91 # get target texts
  92 
  93 m  <- gregexpr(pattern.texts, text, perl = TRUE)
  94 subtext <- unlist(regmatches(text, m))
  95 m <- regexpr(pattern.texts, subtext, perl = TRUE)
  96 texts <- get.group(subtext, m)
  97 tags <- get.group(subtext, m, group = "tag")
  98 names(texts) <- tags
  99 
 100 # get extra texts when the target is formatted text
 101 
 102 m  <- gregexpr(pattern.xtexts, text, perl = TRUE)
 103 subtext <- unlist(regmatches(text, m))
 104 m <- regexpr(pattern.xtexts, subtext, perl = TRUE)
 105 xtexts <- get.group(subtext, m)
 106 if (length(xtexts) != 0) {
 107     tags <- get.group(subtext, m, group = "tag") 
 108     xtexts <- sapply(unique(tags), function(t) paste0(xtexts[tags == t], collapse = ""))
 109     texts[names(xtexts)] <- paste0(texts[names(xtexts)], xtexts[names(xtexts)])
 110     }
 111 
 112 texts <- gsub("<sf:br/>", "<br>", texts) # indicate paragraph breaks
 113 texts <- gsub("<sf:.*?>", "", texts) # clean texts
 114 
 115 target.texts <- character()
 116 target.texts[targets] <- texts[targets] # align to comment order
 117     
 118 remove(text) # remove risk of printing interactively; its too big
 119 
 120 commentDF <- data.frame(targets, ids, target.texts, comments,
 121                         row.names = seq_along(targets), 
 122                         stringsAsFactors = FALSE)
 123 
 124 write.csv(commentDF, delivery, row.names = seq_along(rownames(commentDF)))

You are not allowed to attach a file to this page.