Merge lp:~henrik-lochmann/goobi-presentation/bug-878893 into lp:~slub.team/goobi-presentation/old-bzr-trunk

Proposed by Henrik Lochmann
Status: Needs review
Proposed branch: lp:~henrik-lochmann/goobi-presentation/bug-878893
Merge into: lp:~slub.team/goobi-presentation/old-bzr-trunk
Diff against target: 317 lines (+202/-3)
8 files modified
dlf/common/class.tx_dlf_document.php (+152/-1)
dlf/common/class.tx_dlf_indexing.php (+3/-0)
dlf/ext_conf_template.txt (+3/-0)
dlf/locallang.xml (+2/-0)
dlf/plugins/pageview/class.tx_dlf_pageview.php (+22/-2)
dlf/plugins/pageview/flexform.xml (+15/-0)
dlf/plugins/pageview/locallang.xml (+2/-0)
dlf/plugins/pageview/template.tmpl (+3/-0)
To merge this branch: bzr merge lp:~henrik-lochmann/goobi-presentation/bug-878893
Reviewer Review Type Date Requested Status
Sebastian Meyer Pending
Review via email: mp+134944@code.launchpad.net
To post a comment you must log in.

Unmerged revisions

137. By Henrik Lochmann

- The pageview plugin now has a template with a ###TEXT### marker and detects whether linked files are images or OCR files

136. By Henrik Lochmann

- indexes plain text for document pages
ATTENTION: this requires a new Lucene field in Solr's schema.xml: '<field name="text" type="standard" indexed="true" stored="true" multiValued="false" />' (provided on request)

135. By Henrik Lochmann

- new method getText(...) and new field $text to retrieve and cache plain-text content from linked OCR files (currently supported: ALTO and plain text)

134. By Henrik Lochmann

- configuration of OCR file groups in extension settings

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'dlf/common/class.tx_dlf_document.php'
2--- dlf/common/class.tx_dlf_document.php 2012-10-02 12:30:10 +0000
3+++ dlf/common/class.tx_dlf_document.php 2012-11-19 15:34:20 +0000
4@@ -265,6 +265,14 @@
5 protected $tableOfContentsLoaded = FALSE;
6
7 /**
8+ * This holds plain text, derived from linked OCR files.
9+ *
10+ * @var array
11+ * @access protected
12+ */
13+ protected $text = array();
14+
15+ /**
16 * This holds the document's thumbnail location.
17 *
18 * @var string
19@@ -1879,7 +1887,150 @@
20 return $this->smLinks;
21
22 }
23-
24+
25+ /**
26+ * This returns the passed page's plain text, in case any OCR information is stored in the underlying document.
27+ *
28+ * @access protected
29+ *
30+ * @param integer $page: The number of the page, plain text is requested for
31+ *
32+ * @return string The passed page's plain text
33+ */
34+ public function getText($page) {
35+
36+ // Validate passed value.
37+ if (!isset($page) || !t3lib_div::testInt($page)) {
38+
39+ return '';
40+
41+ }
42+
43+ if ($this->text[$page] === NULL) {
44+
45+ $extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf'][$this->extKey]);
46+
47+ $ocrGrps = t3lib_div::trimExplode(',', $extConf['fileGrpsOcr']);
48+
49+ foreach ($ocrGrps as $ocrGrp) {
50+
51+ // Get file ID for current group.
52+ $textFileId = $this->physicalPagesInfo[$this->physicalPages[$page]]['files'][strtolower($ocrGrp)];
53+
54+ if (empty($textFileId)) {
55+
56+ if (TYPO3_DLOG) {
57+
58+ t3lib_div::devLog('[tx_dlf_document->getText('.$page.')] No text information found for METS fileGrp "'.$ocrGrp.'"', $this->extKey, t3lib_div::SYSLOG_SEVERITY_INFO);
59+
60+ }
61+
62+ continue;
63+
64+ }
65+
66+ // Get file URL for extracted file ID.
67+ $textFile = $this->getFileLocation($textFileId);
68+
69+ // Extract content depending on text file format.
70+ $content = NULL;
71+
72+ switch ($ocrGrp) {
73+
74+ // For ALTO groups, we load the ALTO XML file, get string nodes and append their CONTENT attributes to the resulting text.
75+ case 'ALTO':
76+
77+ // Turn off libxml's error logging.
78+ $libxmlErrors = libxml_use_internal_errors(TRUE);
79+
80+ // Load XML from file.
81+ $xml = @simplexml_load_file($textFile);
82+
83+ // Reset libxml's error logging.
84+ libxml_use_internal_errors($libxmlErrors);
85+
86+ if ($xml !== FALSE) {
87+
88+ $xml->registerXPathNamespace('alto', 'http://www.loc.gov/standards/alto/');
89+
90+ $altoContentNodes = $xml->xpath('//String/@CONTENT');
91+
92+ // If we did not get a result w/o namespaces prefix, we try again with namespace prefix.
93+ if (empty($altoContentNodes)) {
94+
95+ $altoContentNodes = $xml->xpath('//alto:String/@alto:CONTENT');
96+
97+ }
98+
99+ // Iterate content nodes and concatenate content to result.
100+ if (!empty($altoContentNodes)) {
101+
102+ $content = '';
103+
104+ foreach($altoContentNodes as $string) {
105+
106+ $content .= ' '.(string) $string;
107+
108+ }
109+
110+ }
111+
112+ } else {
113+
114+ if (TYPO3_DLOG) {
115+
116+ t3lib_div::devLog('[tx_dlf_document->getText('.$page.')] Could not load XML file from "'.$textFile.'"', $this->extKey, t3lib_div::SYSLOG_SEVERITY_ERROR);
117+
118+ }
119+
120+ }
121+
122+ break;
123+
124+ case 'TXT':
125+
126+ $content = file_get_contents($textFile);
127+
128+ if (!$content) {
129+
130+ $content = NULL;
131+
132+ if (TYPO3_DLOG) {
133+
134+ t3lib_div::devLog('[tx_dlf_document->getText('.$page.')] Could not load plain text file from "'.$textFile.'"', $this->extKey, t3lib_div::SYSLOG_SEVERITY_ERROR);
135+
136+ }
137+
138+ }
139+
140+ break;
141+
142+ }
143+
144+ // If current OCR group brought a result, end iterating OCR groups and set result.
145+ if ($content !== NULL) {
146+
147+ $this->text[$page] = $content;
148+
149+ break;
150+
151+ }
152+
153+ }
154+
155+ }
156+
157+ // Remark text of page loaded.
158+ if ($this->text[$page] === NULL) {
159+
160+ $this->text[$page] = '';
161+
162+ }
163+
164+ return $this->text[$page];
165+
166+ }
167+
168 /**
169 * This returns the document's thumbnail location
170 *
171
172=== modified file 'dlf/common/class.tx_dlf_indexing.php'
173--- dlf/common/class.tx_dlf_indexing.php 2012-10-02 07:02:46 +0000
174+++ dlf/common/class.tx_dlf_indexing.php 2012-11-19 15:34:20 +0000
175@@ -547,6 +547,9 @@
176
177 $solrDoc->setField('page', $logicalUnit['points']);
178
179+ // Extract full text for entire document and add to solrDoc.
180+ $solrDoc->setField('text', $doc->getText(intval($logicalUnit['points'])));
181+
182 }
183
184 if ($logicalUnit['id'] == $doc->toplevelId) {
185
186=== modified file 'dlf/ext_conf_template.txt'
187--- dlf/ext_conf_template.txt 2012-09-13 17:53:13 +0000
188+++ dlf/ext_conf_template.txt 2012-11-19 15:34:20 +0000
189@@ -22,6 +22,9 @@
190 # cat=Files; type=string; label=LLL:EXT:dlf/locallang.xml:config.fileGrpThumbs
191 fileGrpThumbs = THUMBS
192
193+# cat=Files; type=string; label=LLL:EXT:dlf/locallang.xml:config.fileGrpsOcr
194+fileGrpsOcr = ALTO,TXT
195+
196 # cat=Solr; type=user[EXT:dlf/hooks/class.tx_dlf_em.php:tx_dlf_em->checkSolrConnection]; label=LLL:EXT:dlf/locallang.xml:config.solrConnect
197 solrConnect = 0
198
199
200=== modified file 'dlf/locallang.xml'
201--- dlf/locallang.xml 2012-10-02 07:02:46 +0000
202+++ dlf/locallang.xml 2012-11-19 15:34:20 +0000
203@@ -130,6 +130,7 @@
204 <label index="config.unhideOnIndex">Unhide indexed documents?: Should hidden documents be unhidden when re-indexing them? (default is "FALSE")</label>
205 <label index="config.fileGrps">METS fileGrps: comma-separated list of all @USE attribute values (default is "THUMBS,MIN,DEFAULT,MAX")</label>
206 <label index="config.fileGrpThumbs">Thumbnail fileGrp: @USE attribute value (default is "THUMBS")</label>
207+ <label index="config.fileGrpsOcr">OCR fileGrp: comma-separated list of all @USE attribute values for OCR processing, prioritized in order (default is "ALTO,TXT")</label>
208 <label index="config.solrConnect">Solr Connection</label>
209 <label index="config.solrHost">Solr Server Host: (default is "localhost")</label>
210 <label index="config.solrPort">Solr Server Port: (default is "8180")</label>
211@@ -292,6 +293,7 @@
212 <label index="config.unhideOnIndex">Indexierte Dokumente einblenden?: Sollen ausgeblendete Dokumente bei der erneuten Indexierung wieder eingeblendet werden? (Standard ist "FALSE")</label>
213 <label index="config.fileGrps">METS fileGrps: Komma-getrennte Liste aller @USE Attributwerte (Standard ist "THUMBS,MIN,DEFAULT,MAX")</label>
214 <label index="config.fileGrpThumbs">Vorschau fileGrp: @USE Attributwert der Vorschaubilder (Standard ist "THUMBS")</label>
215+ <label index="config.fileGrpsOcr">OCR fileGrp: Komma-getrennte Liste aller @USE Attributwerte für die OCR Verarbeitung, priorisiert nach Reihenfolge (Standard ist "ALTO,TXT")</label>
216 <label index="config.solrConnect">Solr Verbindung</label>
217 <label index="config.solrHost">Solr Server Host: (Standard ist "localhost")</label>
218 <label index="config.solrPort">Solr Server Port: (Standard ist "8180")</label>
219
220=== modified file 'dlf/plugins/pageview/class.tx_dlf_pageview.php'
221--- dlf/plugins/pageview/class.tx_dlf_pageview.php 2012-09-14 16:54:56 +0000
222+++ dlf/plugins/pageview/class.tx_dlf_pageview.php 2012-11-19 15:34:20 +0000
223@@ -108,10 +108,17 @@
224 $fileGrpUrl = $this->doc->getFileLocation($this->doc->physicalPagesInfo[$this->doc->physicalPages[$page]]['files'][$fileGrp]);
225
226 // Check file's existence.
227- $headers = @get_headers($fileGrpUrl);
228+ $headers = @get_headers($fileGrpUrl, 1);
229
230 if (is_array($headers) && preg_match('/^HTTP\\/\\d+\\.\\d+\\s+2\\d\\d\\s+.*$/', $headers[0])) {
231
232+ // skip non-image files
233+ if (!strstr($headers['Content-Type'], 'image/')) {
234+
235+ continue;
236+
237+ }
238+
239 $fileGrpSize = getimagesize($fileGrpUrl);
240
241 $imageData[] = array (
242@@ -170,8 +177,21 @@
243
244 }
245
246+ // Load template file.
247+ if (!empty($this->conf['templateFile'])) {
248+
249+ $this->template = $this->cObj->getSubpart($this->cObj->fileResource($this->conf['templateFile']), '###TEMPLATE###');
250+
251+ } else {
252+
253+ $this->template = $this->cObj->getSubpart($this->cObj->fileResource('EXT:dlf/plugins/pageview/template.tmpl'), '###TEMPLATE###');
254+
255+ }
256+
257+ $content = $this->cObj->substituteMarkerArray($this->template, array( '###TEXT###' => $this->doc->getText($this->piVars['page']) ));
258+
259 $content .= $this->showViewer();
260-
261+
262 return $this->pi_wrapInBaseClass($content);
263
264 }
265
266=== modified file 'dlf/plugins/pageview/flexform.xml'
267--- dlf/plugins/pageview/flexform.xml 2011-03-09 15:36:27 +0000
268+++ dlf/plugins/pageview/flexform.xml 2012-11-19 15:34:20 +0000
269@@ -35,6 +35,21 @@
270 </config>
271 </TCEforms>
272 </excludeOther>
273+ <templateFile>
274+ <TCEforms>
275+ <exclude>1</exclude>
276+ <label>LLL:EXT:dlf/plugins/pageview/locallang.xml:tt_content.pi_flexform.templateFile</label>
277+ <config>
278+ <type>group</type>
279+ <internal_type>file_reference</internal_type>
280+ <allowed>tmpl,tpl,html,htm,txt</allowed>
281+ <size>1</size>
282+ <maxitems>1</maxitems>
283+ <minitems>0</minitems>
284+ <disable_controls>upload</disable_controls>
285+ </config>
286+ </TCEforms>
287+ </templateFile>
288 </el>
289 </ROOT>
290 </sDEF>
291
292=== modified file 'dlf/plugins/pageview/locallang.xml'
293--- dlf/plugins/pageview/locallang.xml 2011-03-09 15:36:27 +0000
294+++ dlf/plugins/pageview/locallang.xml 2012-11-19 15:34:20 +0000
295@@ -8,10 +8,12 @@
296 <languageKey index="default" type="array">
297 <label index="tt_content.pi_flexform.sheet_general">Options</label>
298 <label index="tt_content.pi_flexform.excludeOther">Show only documents from the selected page</label>
299+ <label index="tt_content.pi_flexform.templateFile">Template file</label>
300 </languageKey>
301 <languageKey index="de" type="array">
302 <label index="tt_content.pi_flexform.sheet_general">Einstellungen</label>
303 <label index="tt_content.pi_flexform.excludeOther">Nur Dokumente der ausgewählten Seite anzeigen</label>
304+ <label index="tt_content.pi_flexform.templateFile">HTML-Template</label>
305 </languageKey>
306 </data>
307 </T3locallang>
308\ No newline at end of file
309
310=== added file 'dlf/plugins/pageview/template.tmpl'
311--- dlf/plugins/pageview/template.tmpl 1970-01-01 00:00:00 +0000
312+++ dlf/plugins/pageview/template.tmpl 2012-11-19 15:34:20 +0000
313@@ -0,0 +1,3 @@
314+<!-- ###TEMPLATE### -->
315+<p>###TEXT###</p>
316+<!-- ###TEMPLATE### -->
317\ No newline at end of file

Subscribers

People subscribed via source and target branches