2 |
- |
1 |
% Copyright (C) 2002 artofcode LLC. All rights reserved.
|
|
|
2 |
%
|
|
|
3 |
% This software is provided AS-IS with no warranty, either express or
|
|
|
4 |
% implied.
|
|
|
5 |
%
|
|
|
6 |
% This software is distributed under license and may not be copied,
|
|
|
7 |
% modified or distributed except as expressly authorized under the terms
|
|
|
8 |
% of the license contained in the file LICENSE in this distribution.
|
|
|
9 |
%
|
|
|
10 |
% For more information about licensing, please refer to
|
|
|
11 |
% http://www.ghostscript.com/licensing/. For information on
|
|
|
12 |
% commercial licensing, go to http://www.artifex.com/licensing/ or
|
|
|
13 |
% contact Artifex Software, Inc., 101 Lucas Valley Road #110,
|
|
|
14 |
% San Rafael, CA 94903, U.S.A., +1(415)492-9861.
|
|
|
15 |
|
|
|
16 |
% $Id: pdf_rbld.ps,v 1.8 2005/02/07 06:38:02 dan Exp $
|
|
|
17 |
% pdf_rbld.ps - Rebuilding of broken PDF files (xref errors)
|
|
|
18 |
|
|
|
19 |
% This module contains routines that are used if we detect an error
|
|
|
20 |
% while reading the xref tables. These routines will scan the file and
|
|
|
21 |
% build an xref table by finding the objects. We also need to find the
|
|
|
22 |
% appropriate trailer dictionary. Note: One procedure is also used
|
|
|
23 |
% even if we do not need to rebuild a PDF file.
|
|
|
24 |
%
|
|
|
25 |
% This module cannot rebuild a PDF file which has had errors created inside
|
|
|
26 |
% of objects or binary data streams. It often succeeds with files that
|
|
|
27 |
% have had their line endings converted between Unix and DOS conventions.
|
|
|
28 |
|
|
|
29 |
% if true --> we have an object with duplicate object and generation numbers.
|
|
|
30 |
% search_objects resets this flag to false before each scan; setxrefentry
% sets it to true.
/dup_obj_gen_num false def
|
|
|
31 |
|
|
|
32 |
% Note: This routine is also used by non-rebuild code.
% Store one entry in the xref tables (the ObjectStream, Objects and
% Generations larrays).  All four operands are left on the stack for the
% caller to dispose of; the generation number comes back incremented by
% one, which is the form in which it is stored.
% <obj num> <strm num> <obj loc> <gen num> setxrefentry
%                        <obj num> <strm num> <obj loc> <gen num + 1>
/setxrefentry
{ % Generation numbers are stored as value + 1; a stored value of 0 is
  % reserved to indicate a free xref entry.
  1 add                         % stack: <obj> <strm> <loc> <gen+1>
  % To save space, generation numbers are kept in an lstring until a
  % stored value greater than 255 turns up.  At that point every element
  % is copied into an larray, which then replaces Generations.
  dup 255 gt {
    Generations ltype /stringtype eq {
      larray Generations llength lgrowto dup    % stack: ... <new> <new>
      0 1 2 index llength 1 sub {               % for each index i
        Generations 1 index lget lput dup       % new[i] = old[i]
      } for
      pop                                       % drop the spare copy
      /Generations exch store                   % install the new larray
    } if
  } if
  % A null slot in Objects means this object number has not been seen yet.
  Objects 4 index lget null eq {
    ObjectStream 4 index 4 index cvx lput       % save ObjectStream number
    Objects 4 index 3 index cvx lput            % save object location
    Generations 4 index 2 index lput            % save generation number
  } {
    % There is already an entry for this object number.
    Generations 4 index lget    % stack: ... <gen+1> <stored gen>
    % Set the error flag only if the incoming generation equals the one
    % already stored.  The comparison must use the previous value: once
    % the entry has been overwritten below it would always compare equal.
    dup 2 index eq { /dup_obj_gen_num true def } if
    % Accept the new entry if its generation number is at least as high
    % as the stored one.  Equal numbers are accepted because there are
    % PDF files containing several objects with the same object and
    % generation numbers.  The normal xref logic only accepts the first
    % such entry it finds, but the 'rebuild PDF' logic can find all of
    % them, and the correct one is usually the last.
    1 index le {
      ObjectStream 4 index 4 index cvx lput     % save ObjectStream number
      Objects 4 index 3 index cvx lput          % save object location
      Generations 4 index 2 index lput          % save generation number
    } if
  } ifelse
} bind def
|
|
|
75 |
|
|
|
76 |
% Dump the xref tables, one output line per object number:
%   <obj num> <gen num> <ObjectStream entry> <Objects entry>
% The tables are the Objects, Generations and ObjectStream larrays.
% larrays are a special Ghostscript object which can be arrays with more
% than 64k elements.
/print_xref                     % - print_xref -
{ 0 1 Objects llength 1 sub {   % stack inside the loop: <obj num>
    dup =only ( ) print
    % Generations holds gen + 1, so subtract one before printing.
    Generations 1 index lget 1 sub =only ( ) print
    ObjectStream 1 index lget ==only ( ) print
    Objects exch lget ===       % object location; uses up <obj num>
  } for
  flush
} bind def
|
|
|
92 |
|
|
|
93 |
% Same as the PostScript token operator, except that errors are ignored:
% if .internalstopped reports that token was stopped, the item left on
% top of the stack is discarded and false is returned, as though no
% token had been found.
/token_nofail
{ { token } .internalstopped { pop false } if
} bind odef
|
|
|
100 |
|
|
|
101 |
% Take the next token from a string and accept it only if it has the
% requested type.
% <string> <type> typed_token <false>                 % no token, or wrong type
% <string> <type> typed_token <obj> <last> <true>     % token of matching type
% Where last is the remainder of the string after the token.
/typed_token
{ exch token_nofail not {
    pop false                   % no token: drop <type>, report failure
  } {                           % stack: <type> <last> <token>
    dup type 4 -1 roll ne {     % stack: <last> <token> <bool>
      pop pop false             % wrong type: drop both, report failure
    } {
      exch true                 % right type: leave <token> <last> <true>
    } ifelse
  } ifelse
} bind def
|
|
|
119 |
|
|
|
120 |
% Allocate space for post_eof_count to be bound into procedures below.
|
|
|
121 |
% search_objects replaces this initial value with the result of
% determine_post_eof_count.
/post_eof_count 0 def
|
|
|
122 |
|
|
|
123 |
% Locate the trailer dictionary that follows an xref table at the start
% of the file.  We look for 'xref' in the first 300 bytes, skip over the
% xref entries, and return the file position just after the 'trailer'
% keyword.  If 'xref' is not found, 0 is returned.
/search_start_trailer           % - search_start_trailer <trailer loc>
{ % Read the first 300 bytes, one byte at a time, and look for 'xref'.
  PDFfile 0 setfileposition
  300 string 0 1 299 { 2 copy PDFfile read pop put pop } for
  (xref) search {
    % Found 'xref'.  Only the text in front of the match is needed: its
    % length plus 4 is the file offset just past the keyword.
    exch pop exch pop length 4 add PDFfile exch setfileposition
    PDFfile token pop           % starting entry number - or 'trailer'
    (trailer) ne {              % if we do not already have 'trailer'
      PDFfile token pop         % number of entries; used as repeat count
      PDFfile token pop pop     % this moves us into the middle of the first entry
      25 string exch            % working string for readline
      { PDFfile 1 index readline pop pop } repeat       % skip the entries
      pop                       % discard the working string
      PDFfile token pop pop     % consume the 'trailer' keyword
    } if
    % In both cases the file is now positioned just past 'trailer'.
    % This must sit outside the 'if': otherwise nothing is returned when
    % 'trailer' directly follows 'xref'.
    PDFfile fileposition
  } {
    pop 0                       % no xref - should not happen
  } ifelse
} bind def
|
|
|
148 |
|
|
|
149 |
% Locate the trailer dictionary at the end of the file: read the last
% block of useful data (at most 4096 bytes) and return the file position
% just after the final occurrence of the word 'trailer'.
/search_end_trailer             % - search_end_trailer <trailer loc>
{ % Anything past the last %%EOF is ignored since it is not PDF data.
  PDFfile 0 setfileposition
  PDFfile bytesavailable post_eof_count sub     % stack: <end pos>
  dup 4096 .min                                 % stack: <end pos> <block size>
  % Position the file at <end pos> - <block size>.
  PDFfile 2 index 2 index sub setfileposition
  % Fill a string of <block size> bytes, one byte at a time.
  dup string exch 1 sub 0 1 3 -1 roll {
    1 index exch PDFfile read pop put
  } for                                         % stack: <end pos> <block>
  % Keep searching in whatever follows each match until no match is
  % left; what remains is the text after the last 'trailer'.
  (trailer) { search { pop } { exit } ifelse } loop
  % trailer loc = end pos - length of the remaining text
  length sub
} bind def
|
|
|
169 |
|
|
|
170 |
% Find the trailer dictionary and store it in Trailer.  There is a
% trailer dictionary for each xref object list, and we only want the one
% associated with the first xref object list.  In theory it can be
% anywhere in the file, but since we are repairing a broken file we
% cannot simply follow the xref links.  Instead we use a simple strategy:
% read the xref position given after the last 'startxref'.  If that
% position lies in the first half of the file, take the first trailer at
% the start of the file; otherwise take the last trailer at the end.
/search_trailer                 % - search_trailer -
{ % The useful data ends at the last %%EOF (some files contain trailing
  % garbage), so post_eof_count is subtracted from the file size.
  PDFfile 0 setfileposition
  PDFfile bytesavailable post_eof_count sub
  dup dup 4096 .min             % stack: <useful size> <useful size> <block size>
  % Position the file at the start of the final block and read it in,
  % one byte at a time.
  PDFfile 2 index 2 index sub setfileposition
  dup string exch 1 sub 0 1 3 -1 roll {
    1 index exch PDFfile read pop put
  } for                         % stack: <useful size> <useful size> <block>
  % Narrow the block down to the text after the last 'startxref'.
  (startxref) { search { pop } { exit } ifelse } loop
  % Position of the keyword itself = useful size - remaining length - 9.
  length sub 9 sub
  PDFfile exch setfileposition
  PDFfile token pop pop         % skip the 'startxref' keyword
  PDFfile token pop             % stack: <useful size> <xref pos>
  % Choose the search according to which half of the file the xref is in.
  exch 2 div ge { search_end_trailer } { search_start_trailer } ifelse
  % Read the trailer found at the returned location.
  PDFfile exch setfileposition
  PDFfile traileropdict .pdfrun
  /Trailer exch def
} bind def
|
|
|
208 |
|
|
|
209 |
% Determine how many bytes follow the last %%EOF in the file.  There is
% supposed to be only a line termination, but many real life files
% contain some garbage.  The count returned here is used to ignore that
% data when scanning for objects.  Only the last block of the file (at
% most 4096 bytes) is examined; if it contains no %%EOF at all, 0 is
% returned so that no data is ignored.
/determine_post_eof_count       % - determine_post_eof_count <count>
{ % Position to read a block of data from the end of the file.
  PDFfile 0 setfileposition
  PDFfile bytesavailable        % stack: <file size>
  dup 4096 .min                 % stack: <file size> <block size>
  2 copy sub PDFfile exch setfileposition
  % Read the block, one byte at a time.
  dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for
                                % stack: <file size> <block>
  % Remember the block length so that "never matched" can be detected.
  dup length exch               % stack: <file size> <block len> <block>
  % Search for the last occurrence of '%%EOF'.
  (%%EOF) { search not { exit } if pop } loop
  length                        % stack: <file size> <block len> <remaining len>
  % A match always shortens the remainder, so equal lengths mean that no
  % %%EOF was found.  Without this check the entire block would be
  % reported as trailing garbage.
  dup 3 -1 roll eq { pop 0 } if
  exch pop                      % drop <file size>, leaving <count>
} bind def
|
|
|
228 |
|
|
|
229 |
% Scan the whole file, line by line, for object definitions of the form
% '<obj num> <gen num> obj', and record the location of each one through
% setxrefentry.  This builds an alternate version of the data in the
% xref tables, as the basis for an xref fixing facility.
/search_objects                 % - search_objects -
{ initPDFobjects                % set up Objects, Generations, etc.
  /dup_obj_gen_num false def    % reset the duplicate-entry error flag
  % Number of bytes in the file after the final %%EOF.
  /post_eof_count determine_post_eof_count def
  PDFfile 0 setfileposition     % start at the beginning of the file
  % Working string, with its length kept underneath it on the stack.  A
  % maximum size string is used since the logic below wants a recovered
  % object line to fit into the working string.
  65535 dup string
  { PDFfile fileposition        % stack: <length> <working_str> <loc>
    % Pick the buffer for this line.  Near the end of the useful data a
    % shorter interval of the working string is used to prevent reading
    % past the end.  (See the EOF test at the bottom of the loop.)
    PDFfile bytesavailable post_eof_count sub 10 sub
    dup 4 index ge {
      pop 1 index               % not near the end: full working string
    } {
      2 index 0 3 -1 roll getinterval   % near the end: shortened interval
    } ifelse
    % Read a line.  If it does not fit into the buffer, or any other
    % error occurs, the line is discarded.
    PDFfile exch { readline } .internalstopped
    { pop pop false } if        % stopped: indicate that there is no string
    { % stack: <length> <working_str> <loc> <string>
      % Take the object number, generation number and 'obj' keyword from
      % the line, checking the type of each one.
      /integertype typed_token {                % object number
        /integertype typed_token {              % generation number
          /nametype typed_token {               % candidate 'obj' keyword
            pop                                 % rest of the line
            /obj eq {
              % Make sure there is room in the arrays.  We work in
              % increments of 20 each time we increase the size.
              1 index 20 add 20 idiv 20 mul
              growPDFobjects
              % stack: ... <loc> <obj num> <gen num>
              1 index 0 4 index 3 index         % <obj num> 0 <loc> <gen num>
              setxrefentry
              4 { pop } repeat                  % clear the parameters
            } if
          } if
          pop                                   % remove generation number
        } if
        pop                                     % remove object number
      } if
    } if
    pop                                         % remove location
    % Stop before the end of the file, since reading past the end closes
    % it.  We actually stop 10-20 bytes early: there cannot be an object
    % that close to the end.  (There is a Trailer dictionary, etc. at the
    % end of the file.)
    PDFfile bytesavailable post_eof_count sub 20 lt { exit } if
  } loop
  pop pop                       % remove working string and its length
  % Warn if two objects shared the same object and generation numbers.
  dup_obj_gen_num {
    ( **** Warning: There are objects with matching object and generation\n)
    pdfformaterror
    ( **** numbers. The accuracy of the resulting image is unknown.\n)
    pdfformaterror
  } if
} bind def
|
|
|
298 |
|
|
|
299 |
% Report that a problem was found while reading the xref tables.  Each
% line of the message is passed to pdfformaterror in turn.
/print_xref_warning             % - print_xref_warning -
{ [ ( **** Warning: An error occurred while reading an XREF table.\n)
    ( **** The file has been damaged. This may have been caused\n)
    ( **** by a problem while converting or transfering the file.\n)
    ( **** Ghostscript will attempt to recover the data.\n)
  ] { pdfformaterror } forall
} bind def
|
|
|
311 |
|
|
|
312 |
% Attempt to recover the XRef data.  This is called after a failure
% while reading the normal XRef tables.  It usually works only for
% pre PDF1.5 versions of PDF files.
/recover_xref_data              % - recover_xref_data -
{ print_xref_warning
  % Remove anything left on the stack by readxref: pop every entry above
  % the depth given by pdfemptycount.
  count pdfemptycount sub { pop } repeat
  search_objects                % rebuild the tables by scanning for objects
} bind def
|