Attachment 'comments.R'
Download 1 # Extract comments and target texts from iWorks Pages documents
2 # Prerequisite: exported as iWorks Pages'09 file "document.09.pages"
3 #
4 # Ian Riley <ian@riley.asia>
5 # 10 April 2019
6
7 library("xml2")
8
# Decode HTML character entities (e.g. "&amp;" -> "&") in a character vector.
# Adapted from stackoverflow.com/questions/5060076, answer by Stibu.
#
# All elements are joined with a "<>" separator, parsed in one
# xml2::read_html() call, and split on the separator again.
#
# x: character vector of strings that may contain entities.
# Returns a character vector of decoded strings, one per element of x.
# NOTE(review): an element whose decoded text itself contains "<>" would
# still be split into several pieces; confirm this cannot occur in the input.
unescape_html2 <- function(x) {
  html <- paste0("<x>", paste0(x, collapse = "<>"), "</x>")
  unescaped <- xml2::xml_text(xml2::read_html(html))
  parts <- strsplit(unescaped, "<>", fixed = TRUE)[[1]]
  # strsplit() drops a trailing empty field, so an empty last element of x
  # (or an all-empty x) would shorten the result; pad it back to length(x).
  short_by <- length(x) - length(parts)
  if (short_by > 0) {
    parts <- c(parts, rep("", short_by))
  }
  parts
}
15
# Extract the text captured by a named group from a regexpr() result.
#
# text:  character vector that was searched.
# m:     value of regexpr(pattern, text, perl = TRUE); its "capture.start"
#        and "capture.length" attributes are read.
# group: name of the capture group to extract (default "text").
#
# Returns the captured strings for the elements of `text` that matched, in
# their original order; elements without a match are dropped.
get.group <- function(text, m, group = "text") {
  start <- attr(m, "capture.start")[, group]
  end <- start + attr(m, "capture.length")[, group] - 1
  matches <- substr(text, start, end)
  # Select by match status (regexpr() reports -1 for "no match") rather than
  # by non-empty capture: a group that matched the empty string must stay in
  # the result, otherwise two groups taken from the same match (e.g. "tag"
  # and "text") end up with different lengths. NA inputs are kept as NA.
  keep <- is.na(m) | m != -1
  matches[keep]
}
23
# Collapse an indented, multi-line string literal into a single line: split
# on newlines, trim whitespace from both ends of every line, and join the
# pieces with no separator.
s <- function(s) {
  pieces <- unlist(strsplit(s, "\n"))
  paste(trimws(pieces), collapse = "")
}
26
# Input archive, the XML member read from it, and the CSV that is written.
pagesFile <- "document.09.pages"
payload <- "index.xml"
delivery <- "comments.csv"

# Perl-style patterns with named capture groups. Each one is written across
# several lines for readability and flattened by s(). Single-quoted literals
# are used so the double quotes inside the patterns need no escaping.

# "text" = value of the sf:target attribute of an <sf:annotation ...> tag
pattern.targets <- s('
  (?:<sf:annotation sf:target=")
  (?<text>.*?)
  (?:")
')

# "text" = value of the sfa:ID attribute of an <sf:text ...> tag
pattern.ids <- s('
  (?:<sf:text sfa:ID=")
  (?<text>.*?)
  (?:")
')

# "text" = content of the first <sf:p ...> paragraph on a line that starts
# with an <sf:text-body ...> tag, up to (not including) the first </sf:p>
pattern.comments <- s('
  (?:^<sf:text-body.*?>)
  (?:<sf:p.*?>)
  (?<text>.*?)
  (?=</sf:p>)
')

# "tag" = sfa:ID of a ranged <sf:annotation-field>, "text" = its content
pattern.texts <- s('
  (?:<sf:annotation-field sfa:ranged="true" sfa:ID=")
  (?<tag>sf:annotation-field-.*?)
  (?:">)
  (?<text>.*?)
  (?:</sf:annotation-field>)
')

# "tag" = sfa:IDREF of an <sf:annotation-field-ref>, "text" = its content
pattern.xtexts <- s('
  (?:<sf:annotation-field-ref sfa:IDREF=")
  (?<tag>sf:annotation-field-.*?)
  (?:">)
  (?<text>.*?)
  (?:</sf:annotation-field-ref>)
')
61
# extract the payload

# Fail early with a clear message instead of a cryptic unzip()/readLines()
# error when the archive or the payload inside it is missing.
if (!file.exists(pagesFile)) {
  stop("File not found: ", pagesFile, call. = FALSE)
}
extracted <- unzip(pagesFile, payload)
if (length(extracted) == 0) {
  stop("Could not extract ", payload, " from ", pagesFile, call. = FALSE)
}

# The payload is XML and presumably UTF-8 (the XML default) -- TODO confirm.
# Declaring the encoding keeps non-ASCII text intact on non-UTF-8 locales;
# warn = FALSE silences the "incomplete final line" warning.
text <- readLines(payload, warn = FALSE, encoding = "UTF-8")

# get annotations node

start <- grep("<sf:annotations>", text, fixed = TRUE)
end <- grep("</sf:annotations>", text, fixed = TRUE)

# nothing to get
if (length(start) == 0) stop("No comments found", call. = FALSE)

# `:` needs scalar bounds: use the first opening line and the first closing
# line at or after it.
start <- start[1]
end <- end[end >= start]
if (length(end) == 0) {
  stop("No closing </sf:annotations> found in ", payload, call. = FALSE)
}
end <- end[1]

notes <- text[start:end]
76
# get comments and their tags

# Annotation targets and text IDs: first match on each line of the node.
targets <- get.group(notes, regexpr(pattern.targets, notes, perl = TRUE))
ids <- get.group(notes, regexpr(pattern.ids, notes, perl = TRUE))

# Comment bodies: strip any remaining <sf:...> markup, decode entities,
# then label every comment with the target it belongs to.
m <- regexpr(pattern.comments, notes, perl = TRUE)
comments <- unescape_html2(gsub("<sf:.*?>", "", get.group(notes, m)))
names(comments) <- targets
90
# get target texts

# Find every ranged annotation field in the document, then re-match each hit
# on its own to pull out the field ID ("tag") and the enclosed text.
m <- gregexpr(pattern.texts, text, perl = TRUE)
subtext <- unlist(regmatches(text, m))
m <- regexpr(pattern.texts, subtext, perl = TRUE)
texts <- get.group(subtext, m)
tags <- get.group(subtext, m, group = "tag")
names(texts) <- tags

# get extra texts when the target is formatted text

m <- gregexpr(pattern.xtexts, text, perl = TRUE)
subtext <- unlist(regmatches(text, m))
m <- regexpr(pattern.xtexts, subtext, perl = TRUE)
xtexts <- get.group(subtext, m)
if (length(xtexts) != 0) {
  tags <- get.group(subtext, m, group = "tag")
  # Join the pieces that share a tag; vapply() guarantees a character vector
  # (named by tag) whatever the input looks like.
  xtexts <- vapply(
    unique(tags),
    function(t) paste0(xtexts[tags == t], collapse = ""),
    character(1)
  )
  # A tag without a matching base text indexes as NA; use "" for it so that
  # paste0() does not produce a literal "NA" prefix.
  base <- texts[names(xtexts)]
  base[is.na(base)] <- ""
  texts[names(xtexts)] <- paste0(base, xtexts)
}

texts <- gsub("<sf:br/>", "<br>", texts) # indicate paragraph breaks
texts <- gsub("<sf:.*?>", "", texts) # clean texts
# NOTE(review): unlike the comments, target texts are not passed through
# unescape_html2(); confirm whether entities should be decoded here too.

# Align to comment order. Indexing by name (rather than assigning by name)
# keeps one entry per target even if a target occurs more than once.
target.texts <- unname(texts[targets])
names(target.texts) <- targets
117
# Drop the raw payload so it cannot be printed by accident in an interactive
# session; it's too big.
rm(text)

# One row per comment, numbered from 1.
commentDF <- data.frame(
  targets = targets,
  ids = ids,
  target.texts = target.texts,
  comments = comments,
  row.names = seq_len(length(targets)),
  stringsAsFactors = FALSE
)

write.csv(commentDF, file = delivery, row.names = seq_len(nrow(commentDF)))
You are not allowed to attach a file to this page.