Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
%    Copyright (C) 2002 artofcode LLC.  All rights reserved.
2
% 
3
% This software is provided AS-IS with no warranty, either express or
4
% implied.
5
% 
6
% This software is distributed under license and may not be copied,
7
% modified or distributed except as expressly authorized under the terms
8
% of the license contained in the file LICENSE in this distribution.
9
% 
10
% For more information about licensing, please refer to
11
% http://www.ghostscript.com/licensing/. For information on
12
% commercial licensing, go to http://www.artifex.com/licensing/ or
13
% contact Artifex Software, Inc., 101 Lucas Valley Road #110,
14
% San Rafael, CA  94903, U.S.A., +1(415)492-9861.
15
 
16
% $Id: pdf_rbld.ps,v 1.8 2005/02/07 06:38:02 dan Exp $
17
% pdf_rbld.ps - Rebuilding of broken PDF files (xref errors)
18
 
19
% This module contains routines that are used if we detect an error
20
% while reading the xref tables.  These routines will scan the file and
21
% build an xref table by finding the objects.  We also need to find the
22
% appropriate trailer dictionary.  Note:  One procedure is also used
23
% even if we do not need to rebuild a PDF file.
24
%
25
% This module cannot rebuild a PDF file which has had errors created inside
26
% of objects or binary data streams.  It often succeeds with files that
27
% have had their end-of-line markers converted between Unix and DOS conventions.
28
 
29
% if true --> we have an object with duplicate object and generation numbers.
30
/dup_obj_gen_num false def	% set true by setxrefentry; reset to false at the start of search_objects
31
 
32
% Note:  This routine is also used by non-rebuild code.
% Store one xref entry.  The xref data is kept in three parallel
% structures indexed by object number: ObjectStream, Objects and
% Generations.
%   <obj num> <strm num> <obj loc> <gen num>  setxrefentry
%                       <obj num> <strm num> <obj loc> <gen num + 1>
% The four operands are left on the stack for the caller to remove; the
% generation number has been incremented (see below).
/setxrefentry
{ % Generation numbers are stored as value + 1.  A stored value of 0 is
  % reserved to indicate a free xref entry.
  1 add
  % To save space, Generations is an lstring until a stored value greater
  % than 255 turns up.  When that happens it is converted to an larray.
  dup 255 gt {
    Generations ltype /stringtype eq {
      larray Generations llength lgrowto dup	% stack: ... <new> <new>
      0 1 Generations llength 1 sub {		% copy each old element across
        Generations 1 index lget lput dup
      } for
      pop
      /Generations exch store			% replace the old lstring
    } if
  } if
  % stack: <obj num> <strm num> <obj loc> <stored gen>
  Objects 4 index lget null eq {
    % No entry recorded yet for this object number: just store it.
    ObjectStream 4 index 4 index cvx lput	% ObjectStream object number
    Objects 4 index 3 index cvx lput		% object location
    Generations 4 index 2 index lput		% generation number (+1)
  } {
    % An entry already exists.  Accept the new one if its generation
    % number is at least as high as the stored one.  Equal numbers are
    % accepted because PDF files exist with several objects sharing the
    % same object and generation numbers; the rebuild scan can find all of
    % them and the correct one is usually the last.
    Generations 4 index lget 1 index		% stack: ... <old gen> <new gen>
    % Flag a true duplicate.  This must be tested BEFORE the entry is
    % overwritten; afterwards the stored value always equals the new one.
    2 copy eq { /dup_obj_gen_num true def } if
    le {
      ObjectStream 4 index 4 index cvx lput	% ObjectStream object number
      Objects 4 index 3 index cvx lput		% object location
      Generations 4 index 2 index lput		% generation number (+1)
    } if
  } ifelse
} bind def
75
 
76
% Debugging aid: dump the xref data, one line per object number.  Each line
% shows the object number, its generation number, the ObjectStream entry
% and the Objects entry (the object location).  Generations are stored
% as value + 1, so 1 is subtracted before printing.
/print_xref				% - print_xref -
{ 0 1 Objects llength 1 sub {		% loop variable: object number
    dup =only (  ) print			% object number
    Generations 1 index lget 1 sub =only	% generation number
    (  ) print
    ObjectStream 1 index lget ==only		% ObjectStream object number
    (  ) print
    Objects exch lget ===			% object location (consumes index)
  } for
  flush
} bind def
92
 
93
% Variant of the PostScript 'token' operator that never raises an error.
% If 'token' fails, the operand it left behind is discarded and false is
% returned, exactly as if no token had been found.
/token_nofail
{ { token } .internalstopped	% a true result here means 'token' failed
  { pop false }			% drop the operand, report "no token"
  if
} bind odef
100
 
101
% Take the next token from a string and accept it only if it has the
% requested type.
%   <string> <type> typed_token <false>			% no token, or wrong type
%   <string> <type> typed_token <obj> <rest> <true>	% token of that type
% <rest> is the part of the string that follows the token.
/typed_token
{ exch token_nofail not
  { pop false				% no token at all: drop <type>
  }
  { dup type 4 -1 roll ne		% stack: <rest> <obj> <mismatch?>
    { pop pop false }			% wrong type: drop <obj> and <rest>
    { exch true }			% right type: reorder to <obj> <rest>
    ifelse
  }
  ifelse
} bind def
119
 
120
% Allocate space for post_eof_count to be bound into procedures below.
121
/post_eof_count 0 def	% placeholder; search_objects redefines it from determine_post_eof_count
122
 
123
% Locate the trailer dictionary that sits near the start of the file.
% The 'xref' keyword is looked for in the first 300 bytes; the xref
% entries that follow it are skipped, and the file position just past the
% 'trailer' keyword is returned.  If no 'xref' is found, 0 is returned.
% Note: the first 300 bytes are read unconditionally, so the file is
% expected to be at least that long.
/search_start_trailer		% - search_start_trailer <trailer loc>
{ PDFfile 0 setfileposition
  % Read the first 300 bytes into a string, one byte at a time.
  300 string 0 1 299 { 2 copy PDFfile read pop put pop } for
  (xref) search {
    % Found 'xref': the length of the text before it is its offset.
    % Reposition the file just past the keyword.
    exch pop exch pop length 4 add PDFfile exch setfileposition
    PDFfile token pop		% starting entry number - or 'trailer'
    (trailer) ne {		% an xref section precedes the trailer
      PDFfile token pop		% number of entries
      PDFfile token pop pop	% moves us into the middle of the first entry
      25 string exch		% working string for readline, under the count
      { PDFfile 1 index readline pop pop
      } repeat			% skip that many entry lines
      pop			% discard the working string
      PDFfile token pop pop	% consume 'trailer'
    } if
    % In both cases the file is now positioned just past 'trailer'.
    % The position has to be returned on either path, otherwise the caller
    % receives no result when the xref section is empty.
    PDFfile fileposition
  } {
    pop 0			% no xref - should not happen
  } ifelse
} bind def
148
 
149
% Locate the trailer dictionary that sits near the end of the file.
% The last block (at most 4096 bytes) of the useful part of the file is
% read and the final occurrence of the word 'trailer' is located.  Bytes
% after the last %%EOF (post_eof_count of them) are not PDF data and are
% excluded.  The result is the file position just past that 'trailer'.
/search_end_trailer		% - search_end_trailer <trailer loc>
{ PDFfile 0 setfileposition
  PDFfile bytesavailable post_eof_count sub	% stack: <end pos>
  dup 4096 .min					% stack: <end pos> <block size>
  % position the file at the start of the block
  PDFfile 2 index 2 index sub setfileposition
  % read the block, one byte at a time
  string					% stack: <end pos> <block>
  0 1 2 index length 1 sub {
    1 index exch PDFfile read pop put
  } for
  % keep searching in the remainder until 'trailer' no longer occurs
  (trailer) {
    search { pop } { exit } ifelse
  } loop					% stack: <end pos> <remainder>
  % trailer loc = end pos - length of what follows the last 'trailer'
  length sub
} bind def
169
 
170
% Find and read the trailer dictionary, storing it as Trailer.
% Every xref list has its own trailer dictionary, and the one wanted is
% the one that belongs to the first xref list.  Because the file is being
% repaired, the xref links cannot be trusted, so a simple heuristic is used:
% read the xref position recorded after the last 'startxref'.  If that
% position lies in the first half of the file, take the first trailer at
% the start of the file; otherwise take the last trailer at the end.
/search_trailer			% - search_trailer -
{ % The useful data ends at the last %%EOF; some files carry trailing
  % garbage after it, which post_eof_count accounts for.
  PDFfile 0 setfileposition
  PDFfile bytesavailable post_eof_count sub	% stack: <end>
  dup dup 4096 .min				% stack: <end> <end> <block size>
  % position the file at the start of the final block
  PDFfile 2 index 2 index sub setfileposition
  % read the block, one byte at a time
  string					% stack: <end> <end> <block>
  0 1 2 index length 1 sub {
    1 index exch PDFfile read pop put
  } for
  % keep searching in the remainder until 'startxref' no longer occurs
  (startxref) {
    search { pop } { exit } ifelse
  } loop					% stack: <end> <end> <remainder>
  % position of the last 'startxref' = end - remainder length - 9
  length sub 9 sub
  PDFfile exch setfileposition
  PDFfile token pop pop				% skip the 'startxref' keyword
  PDFfile token pop				% stack: <end> <xref pos>
  % is the xref position in the first half of the useful data?
  exch 2 div lt
    { search_start_trailer }
    { search_end_trailer }
  ifelse					% stack: <trailer loc>
  PDFfile exch setfileposition
  PDFfile traileropdict .pdfrun			% read the trailer dictionary
  /Trailer exch def
} bind def
208
 
209
% Determine how many bytes follow the last %%EOF.  Only a line
% termination is supposed to be there, but many real-life files carry
% garbage.  The count is used to ignore those bytes when scanning for
% objects.  Only the last block (at most 4096 bytes) of the file is
% examined; if it contains no %%EOF at all, 0 is returned so that no real
% data is mistaken for trailing garbage.
/determine_post_eof_count		% - determine_post_eof_count <count>
{ PDFfile 0 setfileposition
  PDFfile bytesavailable	% stack: <file size>
  dup 4096 .min			% stack: <file size> <block size>
  % move the file position to the start of the block
  2 copy sub PDFfile exch setfileposition
  % read the block, one byte at a time
  dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for
				% stack: <file size> <block>
  % search for the last occurrence of '%%EOF'
  (%%EOF) { search not { exit } if pop } loop
				% stack: <file size> <remainder>
  % count = length of whatever follows the last %%EOF
  length exch 4096 .min		% stack: <count> <block size>
  % If %%EOF was never found, the "remainder" is the whole block and the
  % two lengths are equal.  A real match always leaves a shorter remainder.
  1 index eq { pop 0 } if
} bind def
228
 
229
% Scan the file line by line for object headers ("<int> <int> obj") and
% record the location of each one with setxrefentry.  The result is an
% alternative to the data in the xref tables, and is the basis of the
% xref repair facility.
/search_objects				% - search_objects -
{ initPDFobjects			% set up Objects, Generations, etc.
  /dup_obj_gen_num false def		% no duplicates seen yet
  /post_eof_count determine_post_eof_count def	% bytes to ignore after %%EOF
  PDFfile 0 setfileposition		% scan from the very beginning
  % Working string of maximum size, since a recovered line has to fit in
  % it.  Its length is kept underneath it on the stack.
  65535 dup string			% stack: <length> <working_str>
  { PDFfile fileposition		% stack: <length> <working_str> <loc>
    % Near the end of the file only an interval of the working string is
    % used, so that readline cannot run past the end (see the end-of-file
    % test at the bottom of the loop).
    PDFfile bytesavailable post_eof_count sub 10 sub
    dup 4 index ge {
      pop 1 index			% plenty left: use the full string
    } {
      2 index exch 0 exch getinterval	% near the end: use a shorter interval
    } ifelse
    % Read one line.  A line that does not fit, or any other error, is
    % simply discarded.
    PDFfile exch { readline } .internalstopped
    { pop pop false } if		% stopped: report "no line"
    { % stack: <length> <working_str> <loc> <line>
      % Expect an integer, another integer, then the name 'obj'.
      /integertype typed_token {	% object number
        /integertype typed_token {	% generation number
          /nametype typed_token {	% candidate for 'obj'
            pop				% rest of the line is not needed
            /obj eq {
              % Make room in the arrays, growing in steps of 20.
              1 index 20 add 20 idiv 20 mul growPDFobjects
              % <obj num> 0 <loc> <gen num> setxrefentry
              1 index 0 4 index 3 index setxrefentry
              4 { pop } repeat		% drop what setxrefentry left behind
            } if
          } if
          pop				% generation number
        } if
        pop				% object number
      } if
    } if
    pop					% location
    % Stop 10-20 bytes short of the end of the useful data.  Reading past
    % the end of the file would close it, and no object can start that
    % close to the end (the trailer etc. is there).
    PDFfile bytesavailable post_eof_count sub 20 lt { exit } if
  } loop
  pop pop				% working string and its length
  % Warn if two objects shared both object and generation numbers.
  dup_obj_gen_num {
    (   **** Warning:  There are objects with matching object and generation\n)
    pdfformaterror
    (   **** numbers.  The accuracy of the resulting image is unknown.\n)
    pdfformaterror
  } if
} bind def
298
 
299
% Tell the user that reading the xref tables failed and that recovery
% will be attempted.  Each line of the message is passed to
% pdfformaterror in order.
/print_xref_warning
{ [ (   **** Warning:  An error occurred while reading an XREF table.\n)
    (   **** The file has been damaged.  This may have been caused\n)
    (   **** by a problem while converting or transfering the file.\n)
    (   **** Ghostscript will attempt to recover the data.\n)
  ]
  { pdfformaterror } forall
} bind def
311
 
312
% Entry point for xref recovery, called after a failure while reading the
% normal XRef tables.  It usually works only for files written to PDF
% versions before 1.5.
/recover_xref_data		% - recover_xref_data -
{ print_xref_warning
  % Discard anything readxref left on the stack: pop back down to
  % pdfemptycount entries.
  count pdfemptycount sub
  { pop } repeat
  search_objects		% rebuild the xref data by scanning the file
} bind def