Annotation of OpenXM/src/asir-doc/jtexindex/C/texindex.c, Revision 1.2
1.1 noro 1: /* Prepare TeX index dribble output into an actual index.
2:
3: Version 1.45
4:
5: Copyright (C) 1987, 1991, 1992 Free Software Foundation, Inc.
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License as published by
9: the Free Software Foundation; either version 2, or (at your option)
10: any later version.
11:
12: This program is distributed in the hope that it will be useful,
13: but WITHOUT ANY WARRANTY; without even the implied warranty of
14: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15: GNU General Public License for more details.
16:
17: You should have received a copy of the GNU General Public License
18: along with this program; if not, write to the Free Software
19: Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307. */
20:
21: /*
22: 95.10.18 modified by Nobuyosi KAMEI <CXK03514@niftyserve.or.jp>
23: 日本語EUCが処理できるようにした。
24: 95.10.19 modified by K. Handa <handa@etl.go.jp>
25: \initial{...} の出力を調整。
26: */
27:
28:
29: #include <stdio.h>
30: #include <ctype.h>
31: #include <errno.h>
32: #include <assert.h>
33: #include "getopt.h"
34:
35: #define TEXINDEX_VERSION_STRING "GNU Texindex 2.0 for Texinfo release 3.4"
36: #define TEXINDEX_MODIFICATION_STRING "(modified for Japanese EUC encoding)"
37:
38: #if defined (emacs)
39: # include "../src/config.h"
40: /* Some s/os.h files redefine these. */
41: # undef read
42: # undef close
43: # undef write
44: # undef open
45: #endif
46:
1.2 ! ohara 47: #if defined(__MINGW32__)
! 48: #if !defined(_BSDTYPES_DEFINED)
! 49: typedef unsigned char u_char;
! 50: typedef unsigned short u_short;
! 51: typedef unsigned int u_int;
! 52: typedef unsigned long u_long;
! 53: #define _BSDTYPES_DEFINED
! 54: #endif /* _BSDTYPES_DEFINED */
! 55: #endif /* __MINGW32__*/
! 56:
1.1 noro 57: #if defined (HAVE_STRING_H)
58: # include <string.h>
59: #endif /* HAVE_STRING_H */
60:
61: #if !defined (HAVE_STRCHR)
62: char *strrchr ();
63: #endif /* !HAVE_STRCHR */
64:
65: #if defined (STDC_HEADERS)
66: # include <stdlib.h>
67: #else /* !STDC_HEADERS */
68: char *getenv (), *malloc (), *realloc ();
69: #endif /* !STDC_HEADERS */
70:
71: #if defined (HAVE_UNISTD_H)
72: # include <unistd.h>
73: #else /* !HAVE_UNISTD_H */
74: off_t lseek ();
75: #endif /* !HAVE_UNISTD_H */
76:
77: #if !defined (HAVE_MEMSET)
78: #undef memset
79: #define memset(ptr, ignore, count) bzero (ptr, count)
80: #endif
81:
82:
83: char *mktemp ();
84:
85: #if defined (VMS)
86: # include <file.h>
87: # define TI_NO_ERROR ((1 << 28) | 1)
88: # define TI_FATAL_ERROR ((1 << 28) | 4)
89: # define unlink delete
90: #else /* !VMS */
91: # if defined (HAVE_SYS_FCNTL_H)
92: # include <sys/types.h>
93: # include <sys/fcntl.h>
94: # endif /* HAVE_SYS_FCNTL_H */
95:
96: # if defined (_AIX) || !defined (_POSIX_VERSION)
97: # include <sys/file.h>
98: # else /* !AIX && _POSIX_VERSION */
99: # if !defined (HAVE_SYS_FCNTL_H)
100: # include <fcntl.h>
101: # endif /* !HAVE_FCNTL_H */
102: # endif /* !_AIX && _POSIX_VERSION */
103: # define TI_NO_ERROR 0
104: # define TI_FATAL_ERROR 1
105: #endif /* !VMS */
106:
107: #if !defined (SEEK_SET)
108: # define SEEK_SET 0
109: # define SEEK_CUR 1
110: # define SEEK_END 2
111: #endif /* !SEEK_SET */
112:
113: #if !defined (errno)
114: extern int errno;
115: #endif
116: char *strerror ();
117:
118: /* When sorting in core, this structure describes one line
119: and the position and length of its first keyfield. */
120: struct lineinfo
121: {
122: u_short *text; /* The actual text of the line. */
123: union {
124: u_short *text; /* The start of the key (for textual comparison). */
125: long number; /* The numeric value (for numeric comparison). */
126: } key;
127: long keylen; /* Length of KEY field. */
128: };
129:
130: /* This structure describes a field to use as a sort key. */
131: struct keyfield
132: {
133: int startwords; /* Number of words to skip. */
134: int startchars; /* Number of additional chars to skip. */
135: int endwords; /* Number of words to ignore at end. */
136: int endchars; /* Ditto for characters of last word. */
137: char ignore_blanks; /* Non-zero means ignore spaces and tabs. */
138: char fold_case; /* Non-zero means case doesn't matter. */
139: char reverse; /* Non-zero means compare in reverse order. */
140: char numeric; /* Non-zeros means field is ASCII numeric. */
141: char positional; /* Sort according to file position. */
142: char braced; /* Count balanced-braced groupings as fields. */
143: };
144:
145: /* Vector of keyfields to use. */
146: struct keyfield keyfields[3];
147:
148: /* Number of keyfields stored in that vector. */
149: int num_keyfields = 3;
150:
151: /* Vector of input file names, terminated with a null pointer. */
152: char **infiles;
153:
154: /* Vector of corresponding output file names, or NULL, meaning default it
155: (add an `s' to the end). */
156: char **outfiles;
157:
158: /* Length of `infiles'. */
159: int num_infiles;
160:
161: /* Pointer to the array of pointers to lines being sorted. */
162: u_short **linearray;
163:
164: /* The allocated length of `linearray'. */
165: long nlines;
166:
167: /* Directory to use for temporary files. On Unix, it ends with a slash. */
168: char *tempdir;
169:
170: /* Start of filename to use for temporary files. */
171: char *tempbase;
172:
173: /* Number of last temporary file. */
174: int tempcount;
175:
176: /* Number of last temporary file already deleted.
177: Temporary files are deleted by `flush_tempfiles' in order of creation. */
178: int last_deleted_tempcount;
179:
180: /* During in-core sort, this points to the base of the data block
181: which contains all the lines of data. */
182: u_short *text_base;
183:
184: /* Additional command switches .*/
185:
186: /* Nonzero means do not delete tempfiles -- for debugging. */
187: int keep_tempfiles;
188:
189: /* The name this program was run with. */
190: char *program_name;
191:
192: /* 仮名文字の母音テーブル */
193: int Vowel_Table[0x60];
194:
195: /* Forward declarations of functions in this file. */
196:
197: void decode_command ();
198: void sort_in_core ();
199: void sort_offline ();
200: u_short **parsefile ();
201: u_short *find_field ();
202: u_short *find_pos ();
203: long find_value ();
204: u_short *find_braced_pos (u_short *str, int words, int chars, int ignore_blanks);
205: u_short *find_braced_end ();
206: void writelines (u_short **linearray, int nlines, FILE *ostream);
207: int compare_field ();
208: int compare_full ();
209: long readline ();
210: int merge_files ();
211: int merge_direct ();
212: void pfatal_with_name ();
213: void fatal ();
214: void error ();
215: void *xmalloc (), *xrealloc ();
216: char *concat ();
217: char *maketempname ();
218: void flush_tempfiles ();
219: char *tempcopy ();
220: static void memory_error();
221: static int wstrncmp(const u_short *s1, const u_short *s2, size_t n);
222: static int wfwrite(u_short *ptr, size_t size, size_t nmemb, FILE*stream);
223: static int wtoc(char *cptr, u_short const *ptr, int len);
224: static int Vowel_of(int c);
225: static void init_Vowel_Table(void);
226:
227: #define EUC_BYTE(c) (0x00A1 <= c && c <= 0x00FE)
228: #define G1_CHAR(c) (c & 0xFF00)
229:
230: #define MAX_IN_CORE_SORT 500000
231:
232: void
233: main (argc, argv)
234: int argc;
235: char **argv;
236: {
237: int i;
238:
239: tempcount = 0;
240: last_deleted_tempcount = 0;
241:
242: program_name = strrchr (argv[0], '/');
243: if (program_name != (char *)NULL)
244: program_name++;
245: else
246: program_name = argv[0];
247:
248: /* Describe the kind of sorting to do. */
249: /* The first keyfield uses the first braced field and folds case. */
250: keyfields[0].braced = 1;
251: keyfields[0].fold_case = 1;
252: keyfields[0].endwords = -1;
253: keyfields[0].endchars = -1;
254:
255: /* The second keyfield uses the second braced field, numerically. */
256: keyfields[1].braced = 1;
257: keyfields[1].numeric = 1;
258: keyfields[1].startwords = 1;
259: keyfields[1].endwords = -1;
260: keyfields[1].endchars = -1;
261:
262: /* The third keyfield (which is ignored while discarding duplicates)
263: compares the whole line. */
264: keyfields[2].endwords = -1;
265: keyfields[2].endchars = -1;
266:
267: decode_command (argc, argv);
268:
269: tempbase = mktemp (concat ("txiXXXXXX", "", ""));
270:
271: /* Process input files completely, one by one. */
272:
273: for (i = 0; i < num_infiles; i++)
274: {
275: int desc;
276: long ptr;
277: char *outfile;
278:
279: desc = open (infiles[i], O_RDONLY, 0);
280: if (desc < 0)
281: pfatal_with_name (infiles[i]);
282: lseek (desc, (off_t) 0, SEEK_END);
283: ptr = (long) lseek (desc, (off_t) 0, SEEK_CUR);
284:
285: close (desc);
286:
287: outfile = outfiles[i];
288: if (!outfile)
289: {
290: outfile = concat (infiles[i], "s", "");
291: }
292:
293: if (ptr < MAX_IN_CORE_SORT)
294: /* Sort a small amount of data. */
295: sort_in_core (infiles[i], ptr, outfile);
296: else
297: sort_offline (infiles[i], ptr, outfile);
298: }
299:
300: flush_tempfiles (tempcount);
301: exit (TI_NO_ERROR);
302: }
303:
304: typedef struct
305: {
306: char *long_name;
307: char *short_name;
308: int *variable_ref;
309: int variable_value;
310: char *arg_name;
311: char *doc_string;
312: } TEXINDEX_OPTION;
313:
314: TEXINDEX_OPTION texindex_options[] = {
315: { "--keep", "-k", &keep_tempfiles, 1, (char *)NULL,
316: "Keep temporary files around after processing" },
317: { "--no-keep", 0, &keep_tempfiles, 0, (char *)NULL,
318: "Do not keep temporary files around after processing (default)" },
319: { "--output", "-o", (int *)NULL, 0, "FILE",
320: "Send output to FILE" },
321: { "--version", (char *)NULL, (int *)NULL, 0, (char *)NULL,
322: "Show version information" },
323: { "--help", "-h", (int *)NULL, 0, (char *)NULL, "Produce this listing" },
324: { (char *)NULL, (char *)NULL, (int *)NULL, 0, (char *)NULL }
325: };
326:
327: void
328: usage (result_value)
329: int result_value;
330: {
331: register int i;
332:
333: fprintf (stderr, "Usage: %s [OPTIONS] FILE...\n", program_name);
334: fprintf (stderr, " Generate a permuted index for the TeX files given.\n");
335: fprintf (stderr, " Usually FILE... is `foo.??' for the source file `foo.tex'.\n");
336: fprintf (stderr, " The OPTIONS are:\n");
337:
338: for (i = 0; texindex_options[i].long_name; i++)
339: {
340: fprintf (stderr, " %s %s",
341: texindex_options[i].long_name,
342: texindex_options[i].arg_name ?
343: texindex_options[i].arg_name : "");
344:
345: if (texindex_options[i].short_name)
346: fprintf (stderr, " \n or %s %s",
347: texindex_options[i].short_name,
348: texindex_options[i].arg_name ?
349: texindex_options[i].arg_name : "");
350: fprintf (stderr, "\t%s\n", texindex_options[i].doc_string);
351: }
352: exit (result_value);
353: }
354:
355: /* Decode the command line arguments to set the parameter variables
356: and set up the vector of keyfields and the vector of input files. */
357:
358: void
359: decode_command (argc, argv)
360: int argc;
361: char **argv;
362: {
363: int arg_index = 1;
364: char **ip;
365: char **op;
366:
367: /* Store default values into parameter variables. */
368:
369: tempdir = getenv ("TMPDIR");
370: #ifdef VMS
371: if (tempdir == NULL)
372: tempdir = "sys$scratch:";
373: #else
374: if (tempdir == NULL)
375: tempdir = "/tmp/";
376: else
377: tempdir = concat (tempdir, "/", "");
378: #endif
379:
380: keep_tempfiles = 0;
381:
382: /* Allocate ARGC input files, which must be enough. */
383:
384: infiles = (char **) xmalloc (argc * sizeof (char *));
385: outfiles = (char **) xmalloc (argc * sizeof (char *));
386: ip = infiles;
387: op = outfiles;
388:
389: while (arg_index < argc)
390: {
391: char *arg = argv[arg_index++];
392:
393: if (*arg == '-')
394: {
395: if (strcmp (arg, "--version") == 0)
396: {
397: fprintf (stderr, "%s", TEXINDEX_VERSION_STRING);
398: #ifdef TEXINDEX_MODIFICATION_STRING
399: fprintf (stderr, " %s", TEXINDEX_MODIFICATION_STRING);
400: #endif
401: fprintf (stderr, "\n");
402: exit (0);
403: }
404: else if ((strcmp (arg, "--keep") == 0) ||
405: (strcmp (arg, "-k") == 0))
406: {
407: keep_tempfiles = 1;
408: }
409: else if ((strcmp (arg, "--help") == 0) ||
410: (strcmp (arg, "-h") == 0))
411: {
412: usage (0);
413: }
414: else if ((strcmp (arg, "--output") == 0) ||
415: (strcmp (arg, "-o") == 0))
416: {
417: if (argv[arg_index] != (char *)NULL)
418: {
419: arg_index++;
420: if (op > outfiles)
421: *(op - 1) = argv[arg_index];
422: }
423: else
424: usage (1);
425: }
426: else
427: usage (1);
428: }
429: else
430: {
431: *ip++ = arg;
432: *op++ = (char *)NULL;
433: }
434: }
435:
436: /* Record number of keyfields and terminate list of filenames. */
437: num_infiles = ip - infiles;
438: *ip = (char *)NULL;
439: if (num_infiles == 0)
440: usage (1);
441: }
442:
443: /* Return a name for a temporary file. */
444:
445: char *
446: maketempname (count)
447: int count;
448: {
449: char tempsuffix[10];
450: sprintf (tempsuffix, "%d", count);
451: return concat (tempdir, tempbase, tempsuffix);
452: }
453:
454: /* Delete all temporary files up to TO_COUNT. */
455:
456: void
457: flush_tempfiles (to_count)
458: int to_count;
459: {
460: if (keep_tempfiles)
461: return;
462: while (last_deleted_tempcount < to_count)
463: unlink (maketempname (++last_deleted_tempcount));
464: }
465:
466: /* Copy the input file open on IDESC into a temporary file
467: and return the temporary file name. */
468:
469: #define BUFSIZE 1024
470:
471: char *
472: tempcopy (idesc)
473: int idesc;
474: {
475: char *outfile = maketempname (++tempcount);
476: int odesc;
477: char buffer[BUFSIZE];
478:
479: odesc = open (outfile, O_WRONLY | O_CREAT, 0666);
480:
481: if (odesc < 0)
482: pfatal_with_name (outfile);
483:
484: while (1)
485: {
486: int nread = read (idesc, buffer, BUFSIZE);
487: write (odesc, buffer, nread);
488: if (!nread)
489: break;
490: }
491:
492: close (odesc);
493:
494: return outfile;
495: }
496:
497: /* Compare LINE1 and LINE2 according to the specified set of keyfields. */
498:
499: int
500: compare_full (line1, line2)
501: u_short **line1, **line2;
502: {
503: int i;
504:
505: /* Compare using the first keyfield;
506: if that does not distinguish the lines, try the second keyfield;
507: and so on. */
508:
509: for (i = 0; i < num_keyfields; i++)
510: {
511: long length1, length2;
512: u_short *start1 = find_field (&keyfields[i], *line1, &length1);
513: u_short *start2 = find_field (&keyfields[i], *line2, &length2);
514: int tem = compare_field (&keyfields[i], start1, length1, *line1 - text_base,
515: start2, length2, *line2 - text_base);
516: if (tem)
517: {
518: if (keyfields[i].reverse)
519: return -tem;
520: return tem;
521: }
522: }
523:
524: return 0; /* Lines match exactly. */
525: }
526:
527: /* Compare LINE1 and LINE2, described by structures
528: in which the first keyfield is identified in advance.
529: For positional sorting, assumes that the order of the lines in core
530: reflects their nominal order. */
531:
532: int
533: compare_prepared (line1, line2)
534: struct lineinfo *line1, *line2;
535: {
536: int i;
537: int tem;
538: u_short *text1, *text2;
539:
540: /* Compare using the first keyfield, which has been found for us already. */
541: if (keyfields->positional)
542: {
543: if (line1->text - text_base > line2->text - text_base)
544: tem = 1;
545: else
546: tem = -1;
547: }
548: else if (keyfields->numeric)
549: tem = line1->key.number - line2->key.number;
550: else
551: tem = compare_field (keyfields, line1->key.text, line1->keylen, 0,
552: line2->key.text, line2->keylen, 0);
553: if (tem)
554: {
555: if (keyfields->reverse)
556: return -tem;
557: return tem;
558: }
559:
560: text1 = line1->text;
561: text2 = line2->text;
562:
563: /* Compare using the second keyfield;
564: if that does not distinguish the lines, try the third keyfield;
565: and so on. */
566:
567: for (i = 1; i < num_keyfields; i++)
568: {
569: long length1, length2;
570: u_short *start1 = find_field (&keyfields[i], text1, &length1);
571: u_short *start2 = find_field (&keyfields[i], text2, &length2);
572: int tem = compare_field (&keyfields[i], start1, length1, text1 - text_base,
573: start2, length2, text2 - text_base);
574: if (tem)
575: {
576: if (keyfields[i].reverse)
577: return -tem;
578: return tem;
579: }
580: }
581:
582: return 0; /* Lines match exactly. */
583: }
584:
585: /* Like compare_full but more general.
586: You can pass any strings, and you can say how many keyfields to use.
587: POS1 and POS2 should indicate the nominal positional ordering of
588: the two lines in the input. */
589:
590: int
591: compare_general (str1, str2, pos1, pos2, use_keyfields)
592: u_short *str1, *str2;
593: long pos1, pos2;
594: int use_keyfields;
595: {
596: int i;
597:
598: /* Compare using the first keyfield;
599: if that does not distinguish the lines, try the second keyfield;
600: and so on. */
601:
602: for (i = 0; i < use_keyfields; i++)
603: {
604: long length1, length2;
605: u_short *start1 = find_field (&keyfields[i], str1, &length1);
606: u_short *start2 = find_field (&keyfields[i], str2, &length2);
607: int tem = compare_field (&keyfields[i], start1, length1, pos1,
608: start2, length2, pos2);
609: if (tem)
610: {
611: if (keyfields[i].reverse)
612: return -tem;
613: return tem;
614: }
615: }
616:
617: return 0; /* Lines match exactly. */
618: }
619:
620: /* Find the start and length of a field in STR according to KEYFIELD.
621: A pointer to the starting character is returned, and the length
622: is stored into the int that LENGTHPTR points to. */
623:
624: u_short *
625: find_field (keyfield, str, lengthptr)
626: struct keyfield *keyfield;
627: u_short *str;
628: long *lengthptr;
629: {
630: u_short *start;
631: u_short *end;
632: u_short *(*fun) ();
633:
634: if (keyfield->braced)
635: fun = find_braced_pos;
636: else
637: fun = find_pos;
638:
639: start = (*fun) (str, keyfield->startwords, keyfield->startchars,
640: keyfield->ignore_blanks);
641: if (keyfield->endwords < 0)
642: {
643: if (keyfield->braced)
644: end = find_braced_end (start);
645: else
646: {
647: end = start;
648: while (*end && *end != '\n')
649: end++;
650: }
651: }
652: else
653: {
654: end = (*fun) (str, keyfield->endwords, keyfield->endchars, 0);
655: if (end - str < start - str)
656: end = start;
657: }
658: *lengthptr = end - start;
659: return start;
660: }
661:
662: /* Return a pointer to a specified place within STR,
663: skipping (from the beginning) WORDS words and then CHARS chars.
664: If IGNORE_BLANKS is nonzero, we skip all blanks
665: after finding the specified word. */
666:
667: u_short *
668: find_pos (str, words, chars, ignore_blanks)
669: u_short *str;
670: int words, chars;
671: int ignore_blanks;
672: {
673: int i;
674: u_short *p = str;
675:
676: for (i = 0; i < words; i++)
677: {
678: u_short c;
679: /* Find next bunch of nonblanks and skip them. */
680: while ((c = *p) == ' ' || c == '\t')
681: p++;
682: while ((c = *p) && c != '\n' && !(c == ' ' || c == '\t'))
683: p++;
684: if (!*p || *p == '\n')
685: return p;
686: }
687:
688: while (*p == ' ' || *p == '\t')
689: p++;
690:
691: for (i = 0; i < chars; i++)
692: {
693: if (!*p || *p == '\n')
694: break;
695: p++;
696: }
697: return p;
698: }
699:
700: /* Like find_pos but assumes that each field is surrounded by braces
701: and that braces within fields are balanced. */
702:
703: u_short *
704: find_braced_pos (str, words, chars, ignore_blanks)
705: u_short *str;
706: int words, chars;
707: int ignore_blanks;
708: {
709: int i;
710: int bracelevel;
711: u_short *p = str;
712: u_short c;
713:
714: for (i = 0; i < words; i++)
715: {
716: bracelevel = 1;
717: while ((c = *p++) != '{' && c != '\n' && c)
718: /* Do nothing. */ ;
719: if (c != '{')
720: return p - 1;
721: while (bracelevel)
722: {
723: c = *p++;
724: if (c == '{')
725: bracelevel++;
726: if (c == '}')
727: bracelevel--;
728: if (c == 0 || c == '\n')
729: return p - 1;
730: }
731: }
732:
733: while ((c = *p++) != '{' && c != '\n' && c)
734: /* Do nothing. */ ;
735:
736: if (c != '{')
737: return p - 1;
738:
739: if (ignore_blanks)
740: while ((c = *p) == ' ' || c == '\t')
741: p++;
742:
743: for (i = 0; i < chars; i++)
744: {
745: if (!*p || *p == '\n')
746: break;
747: p++;
748: }
749: return p;
750: }
751:
752: /* Find the end of the balanced-brace field which starts at STR.
753: The position returned is just before the closing brace. */
754:
755: u_short *
756: find_braced_end (str)
757: u_short *str;
758: {
759: int bracelevel;
760: u_short *p = str;
761: u_short c;
762:
763: bracelevel = 1;
764: while (bracelevel)
765: {
766: c = *p++;
767: if (c == '{')
768: bracelevel++;
769: if (c == '}')
770: bracelevel--;
771: if (c == 0 || c == '\n')
772: return p - 1;
773: }
774: return p - 1;
775: }
776:
777: long
778: find_value (start, length)
779: u_short *start;
780: long length;
781: {
782: long val;
783:
784: while (length != 0L)
785: {
786: if (isdigit (*start))
787: break;
788: length--;
789: start++;
790: }
791:
792: val = 0;
793: while(isdigit(*start) && length > 0L)
794: {
795: val = val*10 + *start;
796: start++;
797: length--;
798: }
799:
800: return val;
801: }
802:
803: /* Vector used to translate characters for comparison.
804: This is how we make all alphanumerics follow all else,
805: and ignore case in the first sorting. */
806: int char_order[0x8000];
807:
808: void
809: init_char_order (void)
810: {
811: int i;
812:
813: for (i = 1; i < 0x8000; i++)
814: char_order[i] = i;
815:
816: for (i = '0'; i <= '9'; i++)
817: char_order[i] += 512;
818:
819: for (i = 'a'; i <= 'z'; i++)
820: {
821: char_order[i] = 512 + i;
822: char_order[i + 'A' - 'a'] = 512 + i;
823: }
824:
825: /* 以下のシンボルは無視 (i.e. char_order[XXX] = 0) */
826: for (i = 0x2121; i <= 0x2132; i++) /* 全角空白 〜 _ */
827: char_order[i] = 0;
828: for (i = 0x213D; i <= 0x215B; i++) /* − 〜 】 */
829: char_order[i] = 0;
830:
831: /* 促音、拗音、濁音、半濁音、片仮名/平仮名の処理 */
832: /* 1. 平仮名と片仮名、同じに扱う */
833: /* ぁ/ァ(2421/2521)〜ん/ン(2473/2573) */
834: for (i = 0x2421; i < 0x2474; i++)
835: char_order[i + 0x100] = i;
836:
837: /* 2. 促音、拗音、濁音、半濁音、類 */
838: /* ぁ〜ぉ → あ〜お */
839: for (i = 0x2421; i < 0x242B; i += 2)
840: char_order[i] = char_order[i + 0x100] = i+1;
841: /* が〜ぢ → か〜ち */
842: for (i = 0x242C; i < 0x2443; i += 2)
843: char_order[i] = char_order[i + 0x100] = i-1;
844: /* っ */
845: char_order[0x2443] = char_order[0x2543] = 0x2443+1;
846: /* づ〜ど → つ〜と */
847: for (i = 0x2445; i < 0x244A; i += 2)
848: char_order[i] = char_order[i + 0x100] = i-1;
849: /* ばぱ〜ぼぽ → は〜ほ */
850: for (i = 0x2450; i < 0x245E; i += 3)
851: {
852: char_order[i] = char_order[i + 0x100] = i-1;
853: char_order[i+1] = char_order[i+1 + 0x100] = i-1;
854: }
855: /* ゃ〜ょ → や〜よ */
856: for (i = 0x2463; i < 0x2469; i += 2)
857: char_order[i] = char_order[i + 0x100] = i+1;
858: /* ゎ → わ */
859: char_order[0x246E] = char_order[0x256E] = 0x246E +1;
860:
861: /* 3. 特殊片仮名の扱い */
862: char_order[0x2574] = 0x2426; /* ヴ → う */
863: char_order[0x2575] = 0x242B; /* ヵ → か */
864: char_order[0x2576] = 0x2431; /* ヶ → け */
865:
866: /* JISX0208のアルファベット、数字 */
867: for (i = 0x2330; i <= 0x2339; i++) /* 0〜9 -> 0-9 */
868: char_order[i] = i - 0x2330 + 512;
869: for (i = 0x2341; i <= 0x235A; i++) /* A〜Z、a〜z -> A-Z */
870: char_order[i] = char_order[i + (0x2361 - 0x2341)]
871: = 512 + (i - 0x2341) + 'a';
872:
873: /* 長音処理は比較ルーチンで行なう */
874:
875: init_Vowel_Table();
876: }
877:
878: /* Compare two fields (each specified as a start pointer and a character count)
879: according to KEYFIELD.
880: The sign of the value reports the relation between the fields. */
881:
882: int
883: compare_field (keyfield, start1, length1, pos1, start2, length2, pos2)
884: struct keyfield *keyfield;
885: u_short *start1;
886: long length1;
887: long pos1;
888: u_short *start2;
889: long length2;
890: long pos2;
891: {
892: if (keyfields->positional)
893: {
894: if (pos1 > pos2)
895: return 1;
896: else
897: return -1;
898: }
899: if (keyfield->numeric)
900: {
901: long value = find_value (start1, length1) - find_value (start2, length2);
902: if (value > 0)
903: return 1;
904: if (value < 0)
905: return -1;
906: return 0;
907: }
908: else
909: {
910: /* lexical ってか? */
911: u_short *p1 = start1;
912: u_short *p2 = start2;
913: u_short *e1 = start1 + length1;
914: u_short *e2 = start2 + length2;
915: u_short lastchar1 = 0, lastchar2 = 0;
916: int c1, c2;
917:
918: do
919: {
920: int co1, co2;
921:
922: /* char_order[X] が 0 のもの(シンボル)は無視する */
923: c1 = c2 = 0;
924: while (p1 < e1)
925: {
926: if (char_order[c1 = *p1++]) break;
927: c1 = 0;
928: }
929: while (p2 < e2)
930: {
931: if (char_order[c2 = *p2++]) break;
932: c2 = 0;
933: }
934:
935: /* 長音処理 */
936: if (c1 == 0x213C) /* ー */
937: c1 = Vowel_of(lastchar1);
938: if (c2 == 0x213C) /* ー */
939: c2 = Vowel_of(lastchar2);
940: co1 = char_order[c1];
941: co2 = char_order[c2];
942:
943: if (co1 != co2)
944: return co1 - co2;
945: else
946: lastchar1 = c1, lastchar2 = c2;
947:
948: } while(c1);
949:
950: /* Strings are equal except possibly for case. */
951: p1 = start1;
952: p2 = start2;
953: while (1)
954: {
955: int c1, c2;
956:
957: if (p1 == e1)
958: c1 = 0;
959: else
960: c1 = *p1++;
961: if (p2 == e2)
962: c2 = 0;
963: else
964: c2 = *p2++;
965:
966: if (c1 != c2)
967: /* Reverse sign here so upper case comes out last. */
968: return c2 - c1;
969: if (!c1)
970: break;
971: }
972:
973: return 0;
974: }
975: }
976:
977: /* A `struct linebuffer' is a structure which holds a line of text.
978: `readline' reads a line from a stream into a linebuffer
979: and works regardless of the length of the line. */
980:
981: struct linebuffer
982: {
983: long size;
984: u_short *buffer;
985: };
986:
987: /* Initialize LINEBUFFER for use. */
988:
989: void
990: initbuffer (linebuffer)
991: struct linebuffer *linebuffer;
992: {
993: linebuffer->size = 200;
994: linebuffer->buffer = (u_short *) xmalloc (200*sizeof(u_short));
995: }
996:
997: /* Read a line of text from STREAM into LINEBUFFER.
998: Return the length of the line. */
999:
1000: long
1001: readline (linebuffer, stream)
1002: struct linebuffer *linebuffer;
1003: FILE *stream;
1004: {
1005: u_short *buffer = linebuffer->buffer;
1006: u_short *p = linebuffer->buffer;
1007: u_short *end = p + linebuffer->size;
1008:
1009: while (1)
1010: {
1011: int c = getc (stream);
1012: if (p == end)
1013: {
1014: buffer =
1015: (u_short *) xrealloc (buffer,
1016: (linebuffer->size *= 2)*sizeof(u_short));
1017: p += buffer - linebuffer->buffer;
1018: end += buffer - linebuffer->buffer;
1019: linebuffer->buffer = buffer;
1020: }
1021: if (c < 0 || c == '\n')
1022: {
1023: *p = 0;
1024: break;
1025: }
1026: *p++ = c;
1027: }
1028:
1029: return p - buffer;
1030: }
1031:
1032: /* Sort an input file too big to sort in core. */
1033:
1034: void
1035: sort_offline (infile, nfiles, total, outfile)
1036: char *infile;
1037: int nfiles;
1038: long total;
1039: char *outfile;
1040: {
1041: /* More than enough. */
1042: int ntemps = 2 * (total + MAX_IN_CORE_SORT - 1) / MAX_IN_CORE_SORT;
1043: char **tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
1044: FILE *istream = fopen (infile, "r");
1045: int i;
1046: struct linebuffer lb;
1047: long linelength;
1048: int failure = 0;
1049:
1050: initbuffer (&lb);
1051:
1052: /* Read in one line of input data. */
1053:
1054: linelength = readline (&lb, istream);
1055:
1056: if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
1057: {
1058: error ("%s: not a texinfo index file", infile);
1059: return;
1060: }
1061:
1062: /* Split up the input into `ntemps' temporary files, or maybe fewer,
1063: and put the new files' names into `tempfiles' */
1064:
1065: for (i = 0; i < ntemps; i++)
1066: {
1067: char *outname = maketempname (++tempcount);
1068: FILE *ostream = fopen (outname, "w");
1069: long tempsize = 0;
1070:
1071: if (!ostream)
1072: pfatal_with_name (outname);
1073: tempfiles[i] = outname;
1074:
1075: /* Copy lines into this temp file as long as it does not make file
1076: "too big" or until there are no more lines. */
1077:
1078: while (tempsize + linelength + 1 <= MAX_IN_CORE_SORT)
1079: {
1080: tempsize += linelength + 1;
1081: {
1082: u_short *tp;
1083:
1084: for (tp = lb.buffer; *tp; tp++)
1085: putc(*tp, ostream);
1086: }
1087: putc ('\n', ostream);
1088:
1089: /* Read another line of input data. */
1090:
1091: linelength = readline (&lb, istream);
1092: if (!linelength && feof (istream))
1093: break;
1094:
1095: if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
1096: {
1097: error ("%s: not a texinfo index file", infile);
1098: failure = 1;
1099: goto fail;
1100: }
1101: }
1102: fclose (ostream);
1103: if (feof (istream))
1104: break;
1105: }
1106:
1107: free (lb.buffer);
1108:
1109: fail:
1110: /* Record number of temp files we actually needed. */
1111:
1112: ntemps = i;
1113:
1114: /* Sort each tempfile into another tempfile.
1115: Delete the first set of tempfiles and put the names of the second
1116: into `tempfiles'. */
1117:
1118: for (i = 0; i < ntemps; i++)
1119: {
1120: char *newtemp = maketempname (++tempcount);
1121: sort_in_core (&tempfiles[i], MAX_IN_CORE_SORT, newtemp);
1122: if (!keep_tempfiles)
1123: unlink (tempfiles[i]);
1124: tempfiles[i] = newtemp;
1125: }
1126:
1127: if (failure)
1128: return;
1129:
1130: /* Merge the tempfiles together and indexify. */
1131:
1132: merge_files (tempfiles, ntemps, outfile);
1133: }
1134:
1135: /* Sort INFILE, whose size is TOTAL,
1136: assuming that is small enough to be done in-core,
1137: then indexify it and send the output to OUTFILE (or to stdout). */
1138:
1139: void
1140: sort_in_core (infile, total, outfile)
1141: char *infile;
1142: long total;
1143: char *outfile;
1144: {
1145: u_short **nextline;
1146: u_short *data = (u_short *) xmalloc ((total + 1)*sizeof(u_short));
1147: u_short *file_data;
1148: long file_size;
1149: int i;
1150: FILE *ostream = stdout;
1151: struct lineinfo *lineinfo;
1152: FILE *ifp;
1153: int c;
1154:
1155: /* Read the contents of the file into the moby array `data'. */
1156: ifp = fopen(infile, "r");
1157: if (!ifp)
1158: fatal ("failure reopening %s", infile);
1159: memset(data, 0, (total + 1)*sizeof(u_short)); /* zero clear */
1160: for (i = 0, c = fgetc(ifp); c != EOF; c = fgetc(ifp), i++)
1161: {
1162: if (EUC_BYTE(c))
1163: {
1164: data[i] = (c & 0x007F) << 8;
1165: c = fgetc(ifp) & 0x007F;
1166: }
1167: data[i] += c;
1168: }
1169: file_size = i;
1170: file_data = data;
1171: data[file_size] = 0;
1172:
1173: fclose(ifp);
1174:
1175: if (file_size > 0 && data[0] != '\\' && data[0] != '@')
1176: {
1177: error ("%s: not a texinfo index file", infile);
1178: return;
1179: }
1180:
1181: init_char_order ();
1182:
1183: /* Sort routines want to know this address. */
1184:
1185: text_base = data;
1186:
1187: /* Create the array of pointers to lines, with a default size
1188: frequently enough. */
1189:
1190: nlines = total / 50;
1191: if (!nlines)
1192: nlines = 2;
1193: linearray = (u_short **) xmalloc (nlines * sizeof (u_short *));
1194:
1195: /* `nextline' points to the next free slot in this array.
1196: `nlines' is the allocated size. */
1197:
1198: nextline = linearray;
1199:
1200: /* Parse the input file's data, and make entries for the lines. */
1201:
1202: nextline = parsefile (nextline, file_data, file_size);
1203:
1204: if (nextline == 0)
1205: {
1206: error ("%s: not a texinfo index file", infile);
1207: return;
1208: }
1209:
1210: /* Sort the lines. */
1211:
1212: /* If we have enough space, find the first keyfield of each line in advance.
1213: Make a `struct lineinfo' for each line, which records the keyfield
1214: as well as the line, and sort them. */
1215:
1216: lineinfo = (struct lineinfo *) malloc ((nextline - linearray) * sizeof (struct lineinfo));
1217:
1218: if (lineinfo)
1219: {
1220: struct lineinfo *lp;
1221: u_short **p;
1222:
1223: for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1224: {
1225: lp->text = *p;
1226: lp->key.text = find_field (keyfields, *p, &lp->keylen);
1227: if (keyfields->numeric)
1228: lp->key.number = find_value (lp->key.text, lp->keylen);
1229: }
1230:
1231: qsort (lineinfo, nextline - linearray, sizeof (struct lineinfo), compare_prepared);
1232:
1233: for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1234: *p = lp->text;
1235:
1236: free (lineinfo);
1237: }
1238: else
1239: qsort (linearray, nextline - linearray, sizeof (u_short *), compare_full);
1240:
1241: /* Open the output file. */
1242:
1243: if (outfile)
1244: {
1245: ostream = fopen (outfile, "w");
1246: if (!ostream)
1247: pfatal_with_name (outfile);
1248: }
1249:
1250: writelines (linearray, nextline - linearray, ostream);
1251: if (outfile)
1252: fclose (ostream);
1253:
1254: free (linearray);
1255: free (data);
1256: }
1257:
1258: /* Parse an input string in core into lines.
1259: DATA is the input string, and SIZE is its length.
1260: Data goes in LINEARRAY starting at NEXTLINE.
1261: The value returned is the first entry in LINEARRAY still unused.
1262: Value 0 means input file contents are invalid. */
1263:
1264: u_short **
1265: parsefile (nextline, data, size)
1266: u_short **nextline;
1267: u_short *data;
1268: long size;
1269: {
1270: u_short *p, *end;
1271: u_short **line = nextline;
1272:
1273: p = data;
1274: end = p + size;
1275: *end = 0;
1276:
1277: while (p != end)
1278: {
1279: if (p[0] != '\\' && p[0] != '@')
1280: return 0;
1281:
1282: *line = p;
1283: while (*p && *p != '\n')
1284: p++;
1285: if (p != end)
1286: p++;
1287:
1288: line++;
1289: if (line == linearray + nlines)
1290: {
1291: u_short **old = linearray;
1292: linearray =
1293: (u_short **) xrealloc (linearray,
1294: sizeof (u_short *) * (nlines *= 4));
1295: line += linearray - old;
1296: }
1297: }
1298:
1299: return line;
1300: }
1301:
1302: /* Indexification is a filter applied to the sorted lines
1303: as they are being written to the output file.
1304: Multiple entries for the same name, with different page numbers,
1305: get combined into a single entry with multiple page numbers.
1306: The first braced field, which is used for sorting, is discarded.
1307: However, its first character is examined, folded to lower case,
1308: and if it is different from that in the previous line fed to us
1309: a \initial line is written with one argument, the new initial.
1310:
1311: If an entry has four braced fields, then the second and third
1312: constitute primary and secondary names.
1313: In this case, each change of primary name
1314: generates a \primary line which contains only the primary name,
1315: and in between these are \secondary lines which contain
1316: just a secondary name and page numbers. */
1317:
1318: /* The last primary name we wrote a \primary entry for.
1319: If only one level of indexing is being done, this is the last name seen. */
1320: u_short *lastprimary;
1321: /* Length of storage allocated for lastprimary. */
1322: int lastprimarylength;
1323:
1324: /* Similar, for the secondary name. */
1325: u_short *lastsecondary;
1326: int lastsecondarylength;
1327:
1328: /* Zero if we are not in the middle of writing an entry.
1329: One if we have written the beginning of an entry but have not
1330: yet written any page numbers into it.
1331: Greater than one if we have written the beginning of an entry
1332: plus at least one page number. */
1333: int pending;
1334:
1335: /* The initial (for sorting purposes) of the last primary entry written.
1336: When this changes, a \initial {c} line is written */
1337:
1338: u_short *lastinitial;
1339:
1340: int lastinitiallength;
1341:
1342: /* When we need a string of length 1 for the value of lastinitial,
1343: store it here. */
1344:
1345: u_short lastinitial1[2];
1346:
1347: /* Initialize static storage for writing an index. */
1348:
1349: void
1350: init_index ()
1351: {
1352: pending = 0;
1353: lastinitial = lastinitial1;
1354: lastinitial1[0] = 0;
1355: lastinitial1[1] = 0;
1356: lastinitiallength = 0;
1357: lastprimarylength = 100;
1358: lastprimary = (u_short *) xmalloc ((lastprimarylength + 1)*sizeof(u_short));
1359: memset (lastprimary, '\0', (lastprimarylength + 1)*sizeof(u_short));
1360: lastsecondarylength = 100;
1361: lastsecondary =
1362: (u_short *) xmalloc ((lastsecondarylength + 1)*sizeof(u_short));
1363: memset (lastsecondary, '\0', (lastsecondarylength + 1)*sizeof(u_short));
1364: }
1365:
1366: /* Indexify. Merge entries for the same name,
1367: insert headers for each initial character, etc. */
1368:
1369: void
1370: indexify (u_short *line, FILE *ostream)
1371: {
1372: u_short *primary, *secondary, *pagenumber;
1373: int primarylength, secondarylength = 0, pagelength;
1374: int nosecondary;
1375: int initiallength;
1376: u_short *initial;
1377: u_short initial1[2];
1378: register u_short *p;
1379:
1380: /* First, analyze the parts of the entry fed to us this time. */
1381:
1382: p = find_braced_pos (line, 0, 0, 0);
1383: if (*p == '{')
1384: {
1385: initial = p;
1386: /* Get length of inner pair of braces starting at `p',
1387: including that inner pair of braces. */
1388: initiallength = find_braced_end (p + 1) + 1 - p;
1389: }
1390: else
1391: {
1392: initial = initial1;
1393: initial1[0] = *p & 0x7F7F;
1394: initial1[1] = 0;
1395: initiallength = 1;
1396:
1397: if (initial1[0] >= 'a' && initial1[0] <= 'z')
1398: initial1[0] -= ' ';
1399: else if (initial1[0] > 0x2000)
1400: initial1[0] = char_order[initial1[0]];
1401: }
1402:
1403: pagenumber = find_braced_pos (line, 1, 0, 0);
1404: pagelength = find_braced_end (pagenumber) - pagenumber;
1405: if (pagelength == 0)
1406: abort ();
1407:
1408: primary = find_braced_pos (line, 2, 0, 0);
1409: primarylength = find_braced_end (primary) - primary;
1410:
1411: secondary = find_braced_pos (line, 3, 0, 0);
1412: nosecondary = !*secondary;
1413: if (!nosecondary)
1414: secondarylength = find_braced_end (secondary) - secondary;
1415:
1416: /* If the primary is different from before, make a new primary entry. */
1417: if (wstrncmp (primary, lastprimary, primarylength))
1418: {
1419: /* Close off current secondary entry first, if one is open. */
1420: if (pending)
1421: {
1422: fputs ("}\n", ostream);
1423: pending = 0;
1424: }
1425:
1426: /* If this primary has a different initial, include an entry for
1427: the initial. */
1428: if (initiallength != lastinitiallength ||
1429: (*initial && wstrncmp (initial, lastinitial, initiallength)))
1430: {
1431: fprintf (ostream, "\\initial {");
1432: wfwrite (initial, initiallength, 1, ostream);
1433: fprintf (ostream, "}\n", initial);
1434: if (initial == initial1)
1435: {
1436: lastinitial = lastinitial1;
1437: *lastinitial1 = *initial1;
1438: }
1439: else
1440: {
1441: lastinitial = initial;
1442: }
1443: lastinitiallength = initiallength;
1444: }
1445:
1446: /* Make the entry for the primary. */
1447: if (nosecondary)
1448: fputs ("\\entry {", ostream);
1449: else
1450: fputs ("\\primary {", ostream);
1451: wfwrite (primary, primarylength, 1, ostream);
1452: if (nosecondary)
1453: {
1454: fputs ("}{", ostream);
1455: pending = 1;
1456: }
1457: else
1458: fputs ("}\n", ostream);
1459:
1460: /* Record name of most recent primary. */
1461: if (lastprimarylength < primarylength)
1462: {
1463: lastprimarylength = primarylength + 100;
1464: lastprimary =
1465: (u_short *) xrealloc (lastprimary,
1466: (1 + lastprimarylength)*sizeof(u_short));
1467: }
1468: memcpy (lastprimary, primary, primarylength*sizeof(u_short));
1469: lastprimary[primarylength] = 0;
1470:
1471: /* There is no current secondary within this primary, now. */
1472: lastsecondary[0] = 0;
1473: }
1474:
1475: /* Should not have an entry with no subtopic following one with a subtopic. */
1476:
1477: if (nosecondary && *lastsecondary)
1478: {
1479: char tline[2001];
1480: int len;
1481:
1482: len = wtoc(tline, line, 2000); /* 2000 is much enough ? */
1483: tline[len] = '\0';
1484: error ("entry %s follows an entry with a secondary name", tline);
1485: }
1486:
1487: /* Start a new secondary entry if necessary. */
1488: if (!nosecondary && wstrncmp (secondary, lastsecondary, secondarylength))
1489: {
1490: if (pending)
1491: {
1492: fputs ("}\n", ostream);
1493: pending = 0;
1494: }
1495:
1496: /* Write the entry for the secondary. */
1497: fputs ("\\secondary {", ostream);
1498: wfwrite (secondary, secondarylength, 1, ostream);
1499: fputs ("}{", ostream);
1500: pending = 1;
1501:
1502: /* Record name of most recent secondary. */
1503: if (lastsecondarylength < secondarylength)
1504: {
1505: lastsecondarylength = secondarylength + 100;
1506: lastsecondary =
1507: (u_short *) xrealloc ((lastsecondary,
1508: 1 + lastsecondarylength)*sizeof(u_short));
1509: }
1510: memcpy (lastsecondary, secondary, secondarylength * sizeof(u_short));
1511: lastsecondary[secondarylength] = 0;
1512: }
1513:
1514: /* Here to add one more page number to the current entry. */
1515: if (pending++ != 1)
1516: fputs (", ", ostream); /* Punctuate first, if this is not the first. */
1517: wfwrite (pagenumber, pagelength, 1, ostream);
1518: }
1519:
1520: /* Close out any unfinished output entry. */
1521:
1522: void
1523: finish_index (ostream)
1524: FILE *ostream;
1525: {
1526: if (pending)
1527: fputs ("}\n", ostream);
1528: free (lastprimary);
1529: free (lastsecondary);
1530: }
1531:
1532: /* Copy the lines in the sorted order.
1533: Each line is copied out of the input file it was found in. */
1534:
1535: void
1536: writelines (linearray, nlines, ostream)
1537: u_short **linearray;
1538: int nlines;
1539: FILE *ostream;
1540: {
1541: u_short **stop_line = linearray + nlines;
1542: u_short **next_line;
1543:
1544: init_index ();
1545:
1546: /* Output the text of the lines, and free the buffer space. */
1547:
1548: for (next_line = linearray; next_line != stop_line; next_line++)
1549: {
1550: /* If -u was specified, output the line only if distinct from previous one. */
1551: if (next_line == linearray
1552: /* Compare previous line with this one, using only the
1553: explicitly specd keyfields. */
1554: || compare_general (*(next_line - 1), *next_line, 0L, 0L, num_keyfields - 1))
1555: {
1556: u_short *p = *next_line;
1557: u_short c;
1558:
1559: while ((c = *p++) && c != '\n')
1560: /* Do nothing. */ ;
1561: *(p - 1) = 0;
1562: indexify (*next_line, ostream);
1563: }
1564: }
1565:
1566: finish_index (ostream);
1567: }
1568:
1569: /* Assume (and optionally verify) that each input file is sorted;
1570: merge them and output the result.
1571: Returns nonzero if any input file fails to be sorted.
1572:
1573: This is the high-level interface that can handle an unlimited
1574: number of files. */
1575:
1576: #define MAX_DIRECT_MERGE 10
1577:
1578: int
1579: merge_files (infiles, nfiles, outfile)
1580: char **infiles;
1581: int nfiles;
1582: char *outfile;
1583: {
1584: char **tempfiles;
1585: int ntemps;
1586: int i;
1587: int value = 0;
1588: int start_tempcount = tempcount;
1589:
1590: if (nfiles <= MAX_DIRECT_MERGE)
1591: return merge_direct (infiles, nfiles, outfile);
1592:
1593: /* Merge groups of MAX_DIRECT_MERGE input files at a time,
1594: making a temporary file to hold each group's result. */
1595:
1596: ntemps = (nfiles + MAX_DIRECT_MERGE - 1) / MAX_DIRECT_MERGE;
1597: tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
1598: for (i = 0; i < ntemps; i++)
1599: {
1600: int nf = MAX_DIRECT_MERGE;
1601: if (i + 1 == ntemps)
1602: nf = nfiles - i * MAX_DIRECT_MERGE;
1603: tempfiles[i] = maketempname (++tempcount);
1604: value |= merge_direct (&infiles[i * MAX_DIRECT_MERGE], nf, tempfiles[i]);
1605: }
1606:
1607: /* All temporary files that existed before are no longer needed
1608: since their contents have been merged into our new tempfiles.
1609: So delete them. */
1610: flush_tempfiles (start_tempcount);
1611:
1612: /* Now merge the temporary files we created. */
1613:
1614: merge_files (tempfiles, ntemps, outfile);
1615:
1616: free (tempfiles);
1617:
1618: return value;
1619: }
1620:
1621: /* Assume (and optionally verify) that each input file is sorted;
1622: merge them and output the result.
1623: Returns nonzero if any input file fails to be sorted.
1624:
1625: This version of merging will not work if the number of
1626: input files gets too high. Higher level functions
1627: use it only with a bounded number of input files. */
1628:
1629: int
1630: merge_direct (infiles, nfiles, outfile)
1631: char **infiles;
1632: int nfiles;
1633: char *outfile;
1634: {
1635: struct linebuffer *lb1, *lb2;
1636: struct linebuffer **thisline, **prevline;
1637: FILE **streams;
1638: int i;
1639: int nleft;
1640: int lossage = 0;
1641: int *file_lossage;
1642: struct linebuffer *prev_out = 0;
1643: FILE *ostream = stdout;
1644:
1645: if (outfile)
1646: {
1647: ostream = fopen (outfile, "w");
1648: }
1649: if (!ostream)
1650: pfatal_with_name (outfile);
1651:
1652: init_index ();
1653:
1654: if (nfiles == 0)
1655: {
1656: if (outfile)
1657: fclose (ostream);
1658: return 0;
1659: }
1660:
1661: /* For each file, make two line buffers.
1662: Also, for each file, there is an element of `thisline'
1663: which points at any time to one of the file's two buffers,
1664: and an element of `prevline' which points to the other buffer.
1665: `thisline' is supposed to point to the next available line from the file,
1666: while `prevline' holds the last file line used,
1667: which is remembered so that we can verify that the file is properly sorted. */
1668:
1669: /* lb1 and lb2 contain one buffer each per file. */
1670: lb1 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1671: lb2 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1672:
1673: /* thisline[i] points to the linebuffer holding the next available line in file i,
1674: or is zero if there are no lines left in that file. */
1675: thisline = (struct linebuffer **)
1676: xmalloc (nfiles * sizeof (struct linebuffer *));
1677: /* prevline[i] points to the linebuffer holding the last used line
1678: from file i. This is just for verifying that file i is properly
1679: sorted. */
1680: prevline = (struct linebuffer **)
1681: xmalloc (nfiles * sizeof (struct linebuffer *));
1682: /* streams[i] holds the input stream for file i. */
1683: streams = (FILE **) xmalloc (nfiles * sizeof (FILE *));
1684: /* file_lossage[i] is nonzero if we already know file i is not
1685: properly sorted. */
1686: file_lossage = (int *) xmalloc (nfiles * sizeof (int));
1687:
1688: /* Allocate and initialize all that storage. */
1689:
1690: for (i = 0; i < nfiles; i++)
1691: {
1692: initbuffer (&lb1[i]);
1693: initbuffer (&lb2[i]);
1694: thisline[i] = &lb1[i];
1695: prevline[i] = &lb2[i];
1696: file_lossage[i] = 0;
1697: streams[i] = fopen (infiles[i], "r");
1698: if (!streams[i])
1699: pfatal_with_name (infiles[i]);
1700:
1701: readline (thisline[i], streams[i]);
1702: }
1703:
1704: /* Keep count of number of files not at eof. */
1705: nleft = nfiles;
1706:
1707: while (nleft)
1708: {
1709: struct linebuffer *best = 0;
1710: struct linebuffer *exch;
1711: int bestfile = -1;
1712: int i;
1713:
1714: /* Look at the next avail line of each file; choose the least one. */
1715:
1716: for (i = 0; i < nfiles; i++)
1717: {
1718: if (thisline[i] &&
1719: (!best ||
1720: 0 < compare_general (best->buffer, thisline[i]->buffer,
1721: (long) bestfile, (long) i, num_keyfields)))
1722: {
1723: best = thisline[i];
1724: bestfile = i;
1725: }
1726: }
1727:
1728: /* Output that line, unless it matches the previous one and we
1729: don't want duplicates. */
1730:
1731: if (!(prev_out &&
1732: !compare_general (prev_out->buffer,
1733: best->buffer, 0L, 1L, num_keyfields - 1)))
1734: indexify (best->buffer, ostream);
1735: prev_out = best;
1736:
1737: /* Now make the line the previous of its file, and fetch a new
1738: line from that file. */
1739:
1740: exch = prevline[bestfile];
1741: prevline[bestfile] = thisline[bestfile];
1742: thisline[bestfile] = exch;
1743:
1744: while (1)
1745: {
1746: /* If the file has no more, mark it empty. */
1747:
1748: if (feof (streams[bestfile]))
1749: {
1750: thisline[bestfile] = 0;
1751: /* Update the number of files still not empty. */
1752: nleft--;
1753: break;
1754: }
1755: readline (thisline[bestfile], streams[bestfile]);
1756: if (thisline[bestfile]->buffer[0] || !feof (streams[bestfile]))
1757: break;
1758: }
1759: }
1760:
1761: finish_index (ostream);
1762:
1763: /* Free all storage and close all input streams. */
1764:
1765: for (i = 0; i < nfiles; i++)
1766: {
1767: fclose (streams[i]);
1768: free (lb1[i].buffer);
1769: free (lb2[i].buffer);
1770: }
1771: free (file_lossage);
1772: free (lb1);
1773: free (lb2);
1774: free (thisline);
1775: free (prevline);
1776: free (streams);
1777:
1778: if (outfile)
1779: fclose (ostream);
1780:
1781: return lossage;
1782: }
1783:
1784: /* Print error message and exit. */
1785:
1786: void
1787: fatal (format, arg)
1788: char *format, *arg;
1789: {
1790: error (format, arg);
1791: exit (TI_FATAL_ERROR);
1792: }
1793:
1794: /* Print error message. FORMAT is printf control string, ARG is arg for it. */
1795: void
1796: error (format, arg)
1797: char *format, *arg;
1798: {
1799: printf ("%s: ", program_name);
1800: printf (format, arg);
1801: if (format[strlen (format) -1] != '\n')
1802: printf ("\n");
1803: }
1804:
1805: void
1806: perror_with_name (name)
1807: char *name;
1808: {
1809: char *s;
1810:
1811: s = strerror (errno);
1812: printf ("%s: ", program_name);
1813: printf ("%s; for file `%s'.\n", s, name);
1814: }
1815:
1816: void
1817: pfatal_with_name (name)
1818: char *name;
1819: {
1820: char *s;
1821:
1822: s = strerror (errno);
1823: printf ("%s: ", program_name);
1824: printf ("%s; for file `%s'.\n", s, name);
1825: exit (TI_FATAL_ERROR);
1826: }
1827:
1828: /* Return a newly-allocated string whose contents concatenate those of
1829: S1, S2, S3. */
1830:
1831: char *
1832: concat (s1, s2, s3)
1833: char *s1, *s2, *s3;
1834: {
1835: int len1 = strlen (s1), len2 = strlen (s2), len3 = strlen (s3);
1836: char *result = (char *) xmalloc (len1 + len2 + len3 + 1);
1837:
1838: strcpy (result, s1);
1839: strcpy (result + len1, s2);
1840: strcpy (result + len1 + len2, s3);
1841: *(result + len1 + len2 + len3) = 0;
1842:
1843: return result;
1844: }
1845:
1846: #if !defined (HAVE_STRERROR)
1847: extern char *sys_errlist[];
1848: extern int sys_nerr;
1849:
1850: char *
1851: strerror (num)
1852: int num;
1853: {
1854: if (num >= sys_nerr)
1855: return ("");
1856: else
1857: return (sys_errlist[num]);
1858: }
1859: #endif /* !HAVE_STRERROR */
1860:
1861: #if !defined (HAVE_STRCHR)
1862: char *
1863: strrchr (string, character)
1864: char *string;
1865: int character;
1866: {
1867: register int i;
1868:
1869: for (i = strlen (string) - 1; i > -1; i--)
1870: if (string[i] == character)
1871: return (string + i);
1872:
1873: return ((char *)NULL);
1874: }
1875: #endif /* HAVE_STRCHR */
1876:
1877: /* Just like malloc, but kills the program in case of fatal error. */
1878: void *
1879: xmalloc (nbytes)
1880: int nbytes;
1881: {
1882: void *temp = (void *) malloc (nbytes);
1883:
1884: if (nbytes && temp == (void *)NULL)
1885: memory_error ("xmalloc", nbytes);
1886:
1887: return (temp);
1888: }
1889:
1890: /* Like realloc (), but barfs if there isn't enough memory. */
1891: void *
1892: xrealloc (pointer, nbytes)
1893: void *pointer;
1894: int nbytes;
1895: {
1896: void *temp;
1897:
1898: if (!pointer)
1899: temp = (void *)xmalloc (nbytes);
1900: else
1901: temp = (void *)realloc (pointer, nbytes);
1902:
1903: if (nbytes && !temp)
1904: memory_error ("xrealloc", nbytes);
1905:
1906: return (temp);
1907: }
1908:
1909: void
1910: memory_error (callers_name, bytes_wanted)
1911: char *callers_name;
1912: int bytes_wanted;
1913: {
1914: char printable_string[80];
1915:
1916: sprintf (printable_string,
1917: "Virtual memory exhausted in %s ()! Needed %d bytes.",
1918: callers_name, bytes_wanted);
1919:
1920: error (printable_string);
1921: abort();
1922: }
1923:
1924:
1925: static int
1926: wstrncmp(const u_short *s1, const u_short *s2, size_t n)
1927: {
1928: int i;
1929: int d = 0;
1930:
1931: for (i=0; i<n && d == 0; i++)
1932: d = s1[i] - s2[i];
1933:
1934: return d;
1935: }
1936:
1937:
1938: static int
1939: wfwrite(u_short *ptr, size_t size, size_t nmemb, FILE*stream)
1940: {
1941: char *cptr;
1942: int ret, len;
1943:
1944: cptr = (char *)xmalloc(size * nmemb * sizeof(u_short)); /* 丼一杯! */
1945: len = wtoc(cptr, ptr, size*nmemb);
1946: assert(len <= (size * nmemb * sizeof(u_short)));
1947:
1948: ret = fwrite(cptr, len, nmemb, stream);
1949: free(cptr);
1950: return ret;
1951: }
1952:
1953:
1954: static int
1955: wtoc(char *cptr, u_short const *ptr, int len)
1956: {
1957: char *tptr;
1958: u_short c;
1959: int i;
1960:
1961: tptr = cptr;
1962: for (i = 0; i < len; i++)
1963: {
1964: c = ptr[i];
1965: if (G1_CHAR(c))
1966: {
1967: *tptr++ = c >> 8 | 0x0080; /* to EUC */
1968: c &= 0x007F;
1969: c |= 0x0080;
1970: }
1971: *tptr++ = c;
1972: }
1973:
1974: return tptr - cptr;
1975: }
1976:
1977:
1978: static int
1979: Vowel_of(int c)
1980: {
1981: if (0x2521 <= c && c < 0x2577) /* 片仮名は */
1982: c -= 0x100; /* 平仮名へ */
1983: /* ヴ、ヵ、ヶ、の3字は相当する平仮名はないが、コードがあると仮定する */
1984: /* 普通は問題にならない筈。 */
1985:
1986: if (0x2421 <= c && c < 0x2474)
1987: return Vowel_Table[c - 0x2420];
1988: else
1989: return c; /* 平仮名以外はそのまま */
1990: }
1991:
1992:
1993: static void
1994: init_Vowel_Table(void)
1995: {
1996: int i;
1997:
1998: /* あ〜ぞ */
1999: for (i = 0x2421 - 0x2420; i < 0x2445 - 0x2420; i += 10)
2000: {
2001: Vowel_Table[i] = Vowel_Table[i+1] = 0x2422; /* あ段 */
2002: Vowel_Table[i+2] = Vowel_Table[i+3] = 0x2424; /* い段 */
2003: Vowel_Table[i+4] = Vowel_Table[i+5] = 0x2426; /* う段 */
2004: Vowel_Table[i+6] = Vowel_Table[i+7] = 0x2428; /* え段 */
2005: Vowel_Table[i+8] = Vowel_Table[i+9] = 0x242A; /* お段 */
2006: }
2007: Vowel_Table[0x243F - 0x2420] = 0x2422; /* た */
2008: Vowel_Table[0x2440 - 0x2420] = 0x2422; /* だ */
2009: Vowel_Table[0x2441 - 0x2420] = 0x2424; /* ち */
2010: Vowel_Table[0x2442 - 0x2420] = 0x2424; /* ぢ */
2011: Vowel_Table[0x2443 - 0x2420] = 0x2426; /* っ */
2012: Vowel_Table[0x2444 - 0x2420] = 0x2426; /* つ */
2013: Vowel_Table[0x2445 - 0x2420] = 0x2426; /* づ */
2014: Vowel_Table[0x2446 - 0x2420] = 0x2428; /* て */
2015: Vowel_Table[0x2447 - 0x2420] = 0x2428; /* で */
2016: Vowel_Table[0x2448 - 0x2420] = 0x242A; /* と */
2017: Vowel_Table[0x2449 - 0x2420] = 0x242A; /* ど */
2018: Vowel_Table[0x244A - 0x2420] = 0x2422; /* な */
2019: Vowel_Table[0x244B - 0x2420] = 0x2424; /* に */
2020: Vowel_Table[0x244C - 0x2420] = 0x2426; /* ぬ */
2021: Vowel_Table[0x244D - 0x2420] = 0x2428; /* ね */
2022: Vowel_Table[0x244E - 0x2420] = 0x242A; /* の */
2023: /* は〜ぽ */
2024: for (i = 0x244F - 0x2420; i < 0x245E - 0x2420; i += 15)
2025: {
2026: Vowel_Table[i] = Vowel_Table[i+1] = Vowel_Table[i+2] = 0x2422;
2027: Vowel_Table[i+3] = Vowel_Table[i+4] = Vowel_Table[i+5] = 0x2424;
2028: Vowel_Table[i+6] = Vowel_Table[i+7] = Vowel_Table[i+8] = 0x2426;
2029: Vowel_Table[i+9] = Vowel_Table[i+11] = Vowel_Table[i+12] = 0x2428;
2030: Vowel_Table[i+12] = Vowel_Table[i+13] = Vowel_Table[i+14] = 0x242A;
2031: }
2032: Vowel_Table[0x245E - 0x2420] = 0x2422; /* ま */
2033: Vowel_Table[0x245F - 0x2420] = 0x2424; /* み */
2034: Vowel_Table[0x2460 - 0x2420] = 0x2426; /* む */
2035: Vowel_Table[0x2461 - 0x2420] = 0x2428; /* め */
2036: Vowel_Table[0x2462 - 0x2420] = 0x242A; /* も */
2037: Vowel_Table[0x2463 - 0x2420] = 0x2422; /* ゃ */
2038: Vowel_Table[0x2464 - 0x2420] = 0x2422; /* や */
2039: Vowel_Table[0x2465 - 0x2420] = 0x2426; /* ゅ */
2040: Vowel_Table[0x2466 - 0x2420] = 0x2426; /* ゆ */
2041: Vowel_Table[0x2467 - 0x2420] = 0x242A; /* ょ */
2042: Vowel_Table[0x2468 - 0x2420] = 0x242A; /* よ */
2043: Vowel_Table[0x2469 - 0x2420] = 0x2422; /* ら */
2044: Vowel_Table[0x246A - 0x2420] = 0x2424; /* り */
2045: Vowel_Table[0x246B - 0x2420] = 0x2426; /* る */
2046: Vowel_Table[0x246C - 0x2420] = 0x2428; /* れ */
2047: Vowel_Table[0x246D - 0x2420] = 0x242A; /* ろ */
2048: Vowel_Table[0x246E - 0x2420] = 0x2422; /* ゎ */
2049: Vowel_Table[0x246F - 0x2420] = 0x2422; /* わ */
2050: /* ゐ〜ヶ は、そのまま。母音への変換はしない。 */
2051: for (i = 0x2470 - 0x2420; i < 0x2477 - 0x2420; i++)
2052: Vowel_Table[i] = i;
2053: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>