Annotation of OpenXM/src/asir-doc/jtexindex/C/texindex.c, Revision 1.1.1.1
1.1 noro 1: /* Prepare TeX index dribble output into an actual index.
2:
3: Version 1.45
4:
5: Copyright (C) 1987, 1991, 1992 Free Software Foundation, Inc.
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License as published by
9: the Free Software Foundation; either version 2, or (at your option)
10: any later version.
11:
12: This program is distributed in the hope that it will be useful,
13: but WITHOUT ANY WARRANTY; without even the implied warranty of
14: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15: GNU General Public License for more details.
16:
17: You should have received a copy of the GNU General Public License
18: along with this program; if not, write to the Free Software
19: Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307. */
20:
21: /*
22: 95.10.18 modified by Nobuyosi KAMEI <CXK03514@niftyserve.or.jp>
23: 日本語EUCが処理できるようにした。
24: 95.10.19 modified by K. Handa <handa@etl.go.jp>
25: \initial{...} の出力を調整。
26: */
27:
28:
29: #include <stdio.h>
30: #include <ctype.h>
31: #include <errno.h>
32: #include <assert.h>
33: #include "getopt.h"
34:
35: #define TEXINDEX_VERSION_STRING "GNU Texindex 2.0 for Texinfo release 3.4"
36: #define TEXINDEX_MODIFICATION_STRING "(modified for Japanese EUC encoding)"
37:
38: #if defined (emacs)
39: # include "../src/config.h"
40: /* Some s/os.h files redefine these. */
41: # undef read
42: # undef close
43: # undef write
44: # undef open
45: #endif
46:
47: #if defined (HAVE_STRING_H)
48: # include <string.h>
49: #endif /* HAVE_STRING_H */
50:
51: #if !defined (HAVE_STRCHR)
52: char *strrchr ();
53: #endif /* !HAVE_STRCHR */
54:
55: #if defined (STDC_HEADERS)
56: # include <stdlib.h>
57: #else /* !STDC_HEADERS */
58: char *getenv (), *malloc (), *realloc ();
59: #endif /* !STDC_HEADERS */
60:
61: #if defined (HAVE_UNISTD_H)
62: # include <unistd.h>
63: #else /* !HAVE_UNISTD_H */
64: off_t lseek ();
65: #endif /* !HAVE_UNISTD_H */
66:
67: #if !defined (HAVE_MEMSET)
68: #undef memset
69: #define memset(ptr, ignore, count) bzero (ptr, count)
70: #endif
71:
72:
73: char *mktemp ();
74:
75: #if defined (VMS)
76: # include <file.h>
77: # define TI_NO_ERROR ((1 << 28) | 1)
78: # define TI_FATAL_ERROR ((1 << 28) | 4)
79: # define unlink delete
80: #else /* !VMS */
81: # if defined (HAVE_SYS_FCNTL_H)
82: # include <sys/types.h>
83: # include <sys/fcntl.h>
84: # endif /* HAVE_SYS_FCNTL_H */
85:
86: # if defined (_AIX) || !defined (_POSIX_VERSION)
87: # include <sys/file.h>
88: # else /* !AIX && _POSIX_VERSION */
89: # if !defined (HAVE_SYS_FCNTL_H)
90: # include <fcntl.h>
91: # endif /* !HAVE_FCNTL_H */
92: # endif /* !_AIX && _POSIX_VERSION */
93: # define TI_NO_ERROR 0
94: # define TI_FATAL_ERROR 1
95: #endif /* !VMS */
96:
97: #if !defined (SEEK_SET)
98: # define SEEK_SET 0
99: # define SEEK_CUR 1
100: # define SEEK_END 2
101: #endif /* !SEEK_SET */
102:
103: #if !defined (errno)
104: extern int errno;
105: #endif
106: char *strerror ();
107:
108: /* When sorting in core, this structure describes one line
109: and the position and length of its first keyfield. */
110: struct lineinfo
111: {
112: u_short *text; /* The actual text of the line. */
113: union {
114: u_short *text; /* The start of the key (for textual comparison). */
115: long number; /* The numeric value (for numeric comparison). */
116: } key;
117: long keylen; /* Length of KEY field. */
118: };
119:
120: /* This structure describes a field to use as a sort key. */
121: struct keyfield
122: {
123: int startwords; /* Number of words to skip. */
124: int startchars; /* Number of additional chars to skip. */
125: int endwords; /* Number of words to ignore at end. */
126: int endchars; /* Ditto for characters of last word. */
127: char ignore_blanks; /* Non-zero means ignore spaces and tabs. */
128: char fold_case; /* Non-zero means case doesn't matter. */
129: char reverse; /* Non-zero means compare in reverse order. */
130: char numeric; /* Non-zeros means field is ASCII numeric. */
131: char positional; /* Sort according to file position. */
132: char braced; /* Count balanced-braced groupings as fields. */
133: };
134:
135: /* Vector of keyfields to use. */
136: struct keyfield keyfields[3];
137:
138: /* Number of keyfields stored in that vector. */
139: int num_keyfields = 3;
140:
141: /* Vector of input file names, terminated with a null pointer. */
142: char **infiles;
143:
144: /* Vector of corresponding output file names, or NULL, meaning default it
145: (add an `s' to the end). */
146: char **outfiles;
147:
148: /* Length of `infiles'. */
149: int num_infiles;
150:
151: /* Pointer to the array of pointers to lines being sorted. */
152: u_short **linearray;
153:
154: /* The allocated length of `linearray'. */
155: long nlines;
156:
157: /* Directory to use for temporary files. On Unix, it ends with a slash. */
158: char *tempdir;
159:
160: /* Start of filename to use for temporary files. */
161: char *tempbase;
162:
163: /* Number of last temporary file. */
164: int tempcount;
165:
166: /* Number of last temporary file already deleted.
167: Temporary files are deleted by `flush_tempfiles' in order of creation. */
168: int last_deleted_tempcount;
169:
170: /* During in-core sort, this points to the base of the data block
171: which contains all the lines of data. */
172: u_short *text_base;
173:
174: /* Additional command switches .*/
175:
176: /* Nonzero means do not delete tempfiles -- for debugging. */
177: int keep_tempfiles;
178:
179: /* The name this program was run with. */
180: char *program_name;
181:
182: /* 仮名文字の母音テーブル */
183: int Vowel_Table[0x60];
184:
185: /* Forward declarations of functions in this file. */
186:
187: void decode_command ();
188: void sort_in_core ();
189: void sort_offline ();
190: u_short **parsefile ();
191: u_short *find_field ();
192: u_short *find_pos ();
193: long find_value ();
194: u_short *find_braced_pos (u_short *str, int words, int chars, int ignore_blanks);
195: u_short *find_braced_end ();
196: void writelines (u_short **linearray, int nlines, FILE *ostream);
197: int compare_field ();
198: int compare_full ();
199: long readline ();
200: int merge_files ();
201: int merge_direct ();
202: void pfatal_with_name ();
203: void fatal ();
204: void error ();
205: void *xmalloc (), *xrealloc ();
206: char *concat ();
207: char *maketempname ();
208: void flush_tempfiles ();
209: char *tempcopy ();
210: static void memory_error();
211: static int wstrncmp(const u_short *s1, const u_short *s2, size_t n);
212: static int wfwrite(u_short *ptr, size_t size, size_t nmemb, FILE*stream);
213: static int wtoc(char *cptr, u_short const *ptr, int len);
214: static int Vowel_of(int c);
215: static void init_Vowel_Table(void);
216:
217: #define EUC_BYTE(c) (0x00A1 <= c && c <= 0x00FE)
218: #define G1_CHAR(c) (c & 0xFF00)
219:
220: #define MAX_IN_CORE_SORT 500000
221:
222: void
223: main (argc, argv)
224: int argc;
225: char **argv;
226: {
227: int i;
228:
229: tempcount = 0;
230: last_deleted_tempcount = 0;
231:
232: program_name = strrchr (argv[0], '/');
233: if (program_name != (char *)NULL)
234: program_name++;
235: else
236: program_name = argv[0];
237:
238: /* Describe the kind of sorting to do. */
239: /* The first keyfield uses the first braced field and folds case. */
240: keyfields[0].braced = 1;
241: keyfields[0].fold_case = 1;
242: keyfields[0].endwords = -1;
243: keyfields[0].endchars = -1;
244:
245: /* The second keyfield uses the second braced field, numerically. */
246: keyfields[1].braced = 1;
247: keyfields[1].numeric = 1;
248: keyfields[1].startwords = 1;
249: keyfields[1].endwords = -1;
250: keyfields[1].endchars = -1;
251:
252: /* The third keyfield (which is ignored while discarding duplicates)
253: compares the whole line. */
254: keyfields[2].endwords = -1;
255: keyfields[2].endchars = -1;
256:
257: decode_command (argc, argv);
258:
259: tempbase = mktemp (concat ("txiXXXXXX", "", ""));
260:
261: /* Process input files completely, one by one. */
262:
263: for (i = 0; i < num_infiles; i++)
264: {
265: int desc;
266: long ptr;
267: char *outfile;
268:
269: desc = open (infiles[i], O_RDONLY, 0);
270: if (desc < 0)
271: pfatal_with_name (infiles[i]);
272: lseek (desc, (off_t) 0, SEEK_END);
273: ptr = (long) lseek (desc, (off_t) 0, SEEK_CUR);
274:
275: close (desc);
276:
277: outfile = outfiles[i];
278: if (!outfile)
279: {
280: outfile = concat (infiles[i], "s", "");
281: }
282:
283: if (ptr < MAX_IN_CORE_SORT)
284: /* Sort a small amount of data. */
285: sort_in_core (infiles[i], ptr, outfile);
286: else
287: sort_offline (infiles[i], ptr, outfile);
288: }
289:
290: flush_tempfiles (tempcount);
291: exit (TI_NO_ERROR);
292: }
293:
294: typedef struct
295: {
296: char *long_name;
297: char *short_name;
298: int *variable_ref;
299: int variable_value;
300: char *arg_name;
301: char *doc_string;
302: } TEXINDEX_OPTION;
303:
304: TEXINDEX_OPTION texindex_options[] = {
305: { "--keep", "-k", &keep_tempfiles, 1, (char *)NULL,
306: "Keep temporary files around after processing" },
307: { "--no-keep", 0, &keep_tempfiles, 0, (char *)NULL,
308: "Do not keep temporary files around after processing (default)" },
309: { "--output", "-o", (int *)NULL, 0, "FILE",
310: "Send output to FILE" },
311: { "--version", (char *)NULL, (int *)NULL, 0, (char *)NULL,
312: "Show version information" },
313: { "--help", "-h", (int *)NULL, 0, (char *)NULL, "Produce this listing" },
314: { (char *)NULL, (char *)NULL, (int *)NULL, 0, (char *)NULL }
315: };
316:
317: void
318: usage (result_value)
319: int result_value;
320: {
321: register int i;
322:
323: fprintf (stderr, "Usage: %s [OPTIONS] FILE...\n", program_name);
324: fprintf (stderr, " Generate a permuted index for the TeX files given.\n");
325: fprintf (stderr, " Usually FILE... is `foo.??' for the source file `foo.tex'.\n");
326: fprintf (stderr, " The OPTIONS are:\n");
327:
328: for (i = 0; texindex_options[i].long_name; i++)
329: {
330: fprintf (stderr, " %s %s",
331: texindex_options[i].long_name,
332: texindex_options[i].arg_name ?
333: texindex_options[i].arg_name : "");
334:
335: if (texindex_options[i].short_name)
336: fprintf (stderr, " \n or %s %s",
337: texindex_options[i].short_name,
338: texindex_options[i].arg_name ?
339: texindex_options[i].arg_name : "");
340: fprintf (stderr, "\t%s\n", texindex_options[i].doc_string);
341: }
342: exit (result_value);
343: }
344:
345: /* Decode the command line arguments to set the parameter variables
346: and set up the vector of keyfields and the vector of input files. */
347:
348: void
349: decode_command (argc, argv)
350: int argc;
351: char **argv;
352: {
353: int arg_index = 1;
354: char **ip;
355: char **op;
356:
357: /* Store default values into parameter variables. */
358:
359: tempdir = getenv ("TMPDIR");
360: #ifdef VMS
361: if (tempdir == NULL)
362: tempdir = "sys$scratch:";
363: #else
364: if (tempdir == NULL)
365: tempdir = "/tmp/";
366: else
367: tempdir = concat (tempdir, "/", "");
368: #endif
369:
370: keep_tempfiles = 0;
371:
372: /* Allocate ARGC input files, which must be enough. */
373:
374: infiles = (char **) xmalloc (argc * sizeof (char *));
375: outfiles = (char **) xmalloc (argc * sizeof (char *));
376: ip = infiles;
377: op = outfiles;
378:
379: while (arg_index < argc)
380: {
381: char *arg = argv[arg_index++];
382:
383: if (*arg == '-')
384: {
385: if (strcmp (arg, "--version") == 0)
386: {
387: fprintf (stderr, "%s", TEXINDEX_VERSION_STRING);
388: #ifdef TEXINDEX_MODIFICATION_STRING
389: fprintf (stderr, " %s", TEXINDEX_MODIFICATION_STRING);
390: #endif
391: fprintf (stderr, "\n");
392: exit (0);
393: }
394: else if ((strcmp (arg, "--keep") == 0) ||
395: (strcmp (arg, "-k") == 0))
396: {
397: keep_tempfiles = 1;
398: }
399: else if ((strcmp (arg, "--help") == 0) ||
400: (strcmp (arg, "-h") == 0))
401: {
402: usage (0);
403: }
404: else if ((strcmp (arg, "--output") == 0) ||
405: (strcmp (arg, "-o") == 0))
406: {
407: if (argv[arg_index] != (char *)NULL)
408: {
409: arg_index++;
410: if (op > outfiles)
411: *(op - 1) = argv[arg_index];
412: }
413: else
414: usage (1);
415: }
416: else
417: usage (1);
418: }
419: else
420: {
421: *ip++ = arg;
422: *op++ = (char *)NULL;
423: }
424: }
425:
426: /* Record number of keyfields and terminate list of filenames. */
427: num_infiles = ip - infiles;
428: *ip = (char *)NULL;
429: if (num_infiles == 0)
430: usage (1);
431: }
432:
433: /* Return a name for a temporary file. */
434:
435: char *
436: maketempname (count)
437: int count;
438: {
439: char tempsuffix[10];
440: sprintf (tempsuffix, "%d", count);
441: return concat (tempdir, tempbase, tempsuffix);
442: }
443:
444: /* Delete all temporary files up to TO_COUNT. */
445:
446: void
447: flush_tempfiles (to_count)
448: int to_count;
449: {
450: if (keep_tempfiles)
451: return;
452: while (last_deleted_tempcount < to_count)
453: unlink (maketempname (++last_deleted_tempcount));
454: }
455:
456: /* Copy the input file open on IDESC into a temporary file
457: and return the temporary file name. */
458:
459: #define BUFSIZE 1024
460:
461: char *
462: tempcopy (idesc)
463: int idesc;
464: {
465: char *outfile = maketempname (++tempcount);
466: int odesc;
467: char buffer[BUFSIZE];
468:
469: odesc = open (outfile, O_WRONLY | O_CREAT, 0666);
470:
471: if (odesc < 0)
472: pfatal_with_name (outfile);
473:
474: while (1)
475: {
476: int nread = read (idesc, buffer, BUFSIZE);
477: write (odesc, buffer, nread);
478: if (!nread)
479: break;
480: }
481:
482: close (odesc);
483:
484: return outfile;
485: }
486:
487: /* Compare LINE1 and LINE2 according to the specified set of keyfields. */
488:
489: int
490: compare_full (line1, line2)
491: u_short **line1, **line2;
492: {
493: int i;
494:
495: /* Compare using the first keyfield;
496: if that does not distinguish the lines, try the second keyfield;
497: and so on. */
498:
499: for (i = 0; i < num_keyfields; i++)
500: {
501: long length1, length2;
502: u_short *start1 = find_field (&keyfields[i], *line1, &length1);
503: u_short *start2 = find_field (&keyfields[i], *line2, &length2);
504: int tem = compare_field (&keyfields[i], start1, length1, *line1 - text_base,
505: start2, length2, *line2 - text_base);
506: if (tem)
507: {
508: if (keyfields[i].reverse)
509: return -tem;
510: return tem;
511: }
512: }
513:
514: return 0; /* Lines match exactly. */
515: }
516:
517: /* Compare LINE1 and LINE2, described by structures
518: in which the first keyfield is identified in advance.
519: For positional sorting, assumes that the order of the lines in core
520: reflects their nominal order. */
521:
522: int
523: compare_prepared (line1, line2)
524: struct lineinfo *line1, *line2;
525: {
526: int i;
527: int tem;
528: u_short *text1, *text2;
529:
530: /* Compare using the first keyfield, which has been found for us already. */
531: if (keyfields->positional)
532: {
533: if (line1->text - text_base > line2->text - text_base)
534: tem = 1;
535: else
536: tem = -1;
537: }
538: else if (keyfields->numeric)
539: tem = line1->key.number - line2->key.number;
540: else
541: tem = compare_field (keyfields, line1->key.text, line1->keylen, 0,
542: line2->key.text, line2->keylen, 0);
543: if (tem)
544: {
545: if (keyfields->reverse)
546: return -tem;
547: return tem;
548: }
549:
550: text1 = line1->text;
551: text2 = line2->text;
552:
553: /* Compare using the second keyfield;
554: if that does not distinguish the lines, try the third keyfield;
555: and so on. */
556:
557: for (i = 1; i < num_keyfields; i++)
558: {
559: long length1, length2;
560: u_short *start1 = find_field (&keyfields[i], text1, &length1);
561: u_short *start2 = find_field (&keyfields[i], text2, &length2);
562: int tem = compare_field (&keyfields[i], start1, length1, text1 - text_base,
563: start2, length2, text2 - text_base);
564: if (tem)
565: {
566: if (keyfields[i].reverse)
567: return -tem;
568: return tem;
569: }
570: }
571:
572: return 0; /* Lines match exactly. */
573: }
574:
575: /* Like compare_full but more general.
576: You can pass any strings, and you can say how many keyfields to use.
577: POS1 and POS2 should indicate the nominal positional ordering of
578: the two lines in the input. */
579:
580: int
581: compare_general (str1, str2, pos1, pos2, use_keyfields)
582: u_short *str1, *str2;
583: long pos1, pos2;
584: int use_keyfields;
585: {
586: int i;
587:
588: /* Compare using the first keyfield;
589: if that does not distinguish the lines, try the second keyfield;
590: and so on. */
591:
592: for (i = 0; i < use_keyfields; i++)
593: {
594: long length1, length2;
595: u_short *start1 = find_field (&keyfields[i], str1, &length1);
596: u_short *start2 = find_field (&keyfields[i], str2, &length2);
597: int tem = compare_field (&keyfields[i], start1, length1, pos1,
598: start2, length2, pos2);
599: if (tem)
600: {
601: if (keyfields[i].reverse)
602: return -tem;
603: return tem;
604: }
605: }
606:
607: return 0; /* Lines match exactly. */
608: }
609:
610: /* Find the start and length of a field in STR according to KEYFIELD.
611: A pointer to the starting character is returned, and the length
612: is stored into the int that LENGTHPTR points to. */
613:
614: u_short *
615: find_field (keyfield, str, lengthptr)
616: struct keyfield *keyfield;
617: u_short *str;
618: long *lengthptr;
619: {
620: u_short *start;
621: u_short *end;
622: u_short *(*fun) ();
623:
624: if (keyfield->braced)
625: fun = find_braced_pos;
626: else
627: fun = find_pos;
628:
629: start = (*fun) (str, keyfield->startwords, keyfield->startchars,
630: keyfield->ignore_blanks);
631: if (keyfield->endwords < 0)
632: {
633: if (keyfield->braced)
634: end = find_braced_end (start);
635: else
636: {
637: end = start;
638: while (*end && *end != '\n')
639: end++;
640: }
641: }
642: else
643: {
644: end = (*fun) (str, keyfield->endwords, keyfield->endchars, 0);
645: if (end - str < start - str)
646: end = start;
647: }
648: *lengthptr = end - start;
649: return start;
650: }
651:
652: /* Return a pointer to a specified place within STR,
653: skipping (from the beginning) WORDS words and then CHARS chars.
654: If IGNORE_BLANKS is nonzero, we skip all blanks
655: after finding the specified word. */
656:
657: u_short *
658: find_pos (str, words, chars, ignore_blanks)
659: u_short *str;
660: int words, chars;
661: int ignore_blanks;
662: {
663: int i;
664: u_short *p = str;
665:
666: for (i = 0; i < words; i++)
667: {
668: u_short c;
669: /* Find next bunch of nonblanks and skip them. */
670: while ((c = *p) == ' ' || c == '\t')
671: p++;
672: while ((c = *p) && c != '\n' && !(c == ' ' || c == '\t'))
673: p++;
674: if (!*p || *p == '\n')
675: return p;
676: }
677:
678: while (*p == ' ' || *p == '\t')
679: p++;
680:
681: for (i = 0; i < chars; i++)
682: {
683: if (!*p || *p == '\n')
684: break;
685: p++;
686: }
687: return p;
688: }
689:
690: /* Like find_pos but assumes that each field is surrounded by braces
691: and that braces within fields are balanced. */
692:
693: u_short *
694: find_braced_pos (str, words, chars, ignore_blanks)
695: u_short *str;
696: int words, chars;
697: int ignore_blanks;
698: {
699: int i;
700: int bracelevel;
701: u_short *p = str;
702: u_short c;
703:
704: for (i = 0; i < words; i++)
705: {
706: bracelevel = 1;
707: while ((c = *p++) != '{' && c != '\n' && c)
708: /* Do nothing. */ ;
709: if (c != '{')
710: return p - 1;
711: while (bracelevel)
712: {
713: c = *p++;
714: if (c == '{')
715: bracelevel++;
716: if (c == '}')
717: bracelevel--;
718: if (c == 0 || c == '\n')
719: return p - 1;
720: }
721: }
722:
723: while ((c = *p++) != '{' && c != '\n' && c)
724: /* Do nothing. */ ;
725:
726: if (c != '{')
727: return p - 1;
728:
729: if (ignore_blanks)
730: while ((c = *p) == ' ' || c == '\t')
731: p++;
732:
733: for (i = 0; i < chars; i++)
734: {
735: if (!*p || *p == '\n')
736: break;
737: p++;
738: }
739: return p;
740: }
741:
742: /* Find the end of the balanced-brace field which starts at STR.
743: The position returned is just before the closing brace. */
744:
745: u_short *
746: find_braced_end (str)
747: u_short *str;
748: {
749: int bracelevel;
750: u_short *p = str;
751: u_short c;
752:
753: bracelevel = 1;
754: while (bracelevel)
755: {
756: c = *p++;
757: if (c == '{')
758: bracelevel++;
759: if (c == '}')
760: bracelevel--;
761: if (c == 0 || c == '\n')
762: return p - 1;
763: }
764: return p - 1;
765: }
766:
767: long
768: find_value (start, length)
769: u_short *start;
770: long length;
771: {
772: long val;
773:
774: while (length != 0L)
775: {
776: if (isdigit (*start))
777: break;
778: length--;
779: start++;
780: }
781:
782: val = 0;
783: while(isdigit(*start) && length > 0L)
784: {
785: val = val*10 + *start;
786: start++;
787: length--;
788: }
789:
790: return val;
791: }
792:
793: /* Vector used to translate characters for comparison.
794: This is how we make all alphanumerics follow all else,
795: and ignore case in the first sorting. */
796: int char_order[0x8000];
797:
798: void
799: init_char_order (void)
800: {
801: int i;
802:
803: for (i = 1; i < 0x8000; i++)
804: char_order[i] = i;
805:
806: for (i = '0'; i <= '9'; i++)
807: char_order[i] += 512;
808:
809: for (i = 'a'; i <= 'z'; i++)
810: {
811: char_order[i] = 512 + i;
812: char_order[i + 'A' - 'a'] = 512 + i;
813: }
814:
815: /* 以下のシンボルは無視 (i.e. char_order[XXX] = 0) */
816: for (i = 0x2121; i <= 0x2132; i++) /* 全角空白 〜 _ */
817: char_order[i] = 0;
818: for (i = 0x213D; i <= 0x215B; i++) /* − 〜 】 */
819: char_order[i] = 0;
820:
821: /* 促音、拗音、濁音、半濁音、片仮名/平仮名の処理 */
822: /* 1. 平仮名と片仮名、同じに扱う */
823: /* ぁ/ァ(2421/2521)〜ん/ン(2473/2573) */
824: for (i = 0x2421; i < 0x2474; i++)
825: char_order[i + 0x100] = i;
826:
827: /* 2. 促音、拗音、濁音、半濁音、類 */
828: /* ぁ〜ぉ → あ〜お */
829: for (i = 0x2421; i < 0x242B; i += 2)
830: char_order[i] = char_order[i + 0x100] = i+1;
831: /* が〜ぢ → か〜ち */
832: for (i = 0x242C; i < 0x2443; i += 2)
833: char_order[i] = char_order[i + 0x100] = i-1;
834: /* っ */
835: char_order[0x2443] = char_order[0x2543] = 0x2443+1;
836: /* づ〜ど → つ〜と */
837: for (i = 0x2445; i < 0x244A; i += 2)
838: char_order[i] = char_order[i + 0x100] = i-1;
839: /* ばぱ〜ぼぽ → は〜ほ */
840: for (i = 0x2450; i < 0x245E; i += 3)
841: {
842: char_order[i] = char_order[i + 0x100] = i-1;
843: char_order[i+1] = char_order[i+1 + 0x100] = i-1;
844: }
845: /* ゃ〜ょ → や〜よ */
846: for (i = 0x2463; i < 0x2469; i += 2)
847: char_order[i] = char_order[i + 0x100] = i+1;
848: /* ゎ → わ */
849: char_order[0x246E] = char_order[0x256E] = 0x246E +1;
850:
851: /* 3. 特殊片仮名の扱い */
852: char_order[0x2574] = 0x2426; /* ヴ → う */
853: char_order[0x2575] = 0x242B; /* ヵ → か */
854: char_order[0x2576] = 0x2431; /* ヶ → け */
855:
856: /* JISX0208のアルファベット、数字 */
857: for (i = 0x2330; i <= 0x2339; i++) /* 0〜9 -> 0-9 */
858: char_order[i] = i - 0x2330 + 512;
859: for (i = 0x2341; i <= 0x235A; i++) /* A〜Z、a〜z -> A-Z */
860: char_order[i] = char_order[i + (0x2361 - 0x2341)]
861: = 512 + (i - 0x2341) + 'a';
862:
863: /* 長音処理は比較ルーチンで行なう */
864:
865: init_Vowel_Table();
866: }
867:
868: /* Compare two fields (each specified as a start pointer and a character count)
869: according to KEYFIELD.
870: The sign of the value reports the relation between the fields. */
871:
872: int
873: compare_field (keyfield, start1, length1, pos1, start2, length2, pos2)
874: struct keyfield *keyfield;
875: u_short *start1;
876: long length1;
877: long pos1;
878: u_short *start2;
879: long length2;
880: long pos2;
881: {
882: if (keyfields->positional)
883: {
884: if (pos1 > pos2)
885: return 1;
886: else
887: return -1;
888: }
889: if (keyfield->numeric)
890: {
891: long value = find_value (start1, length1) - find_value (start2, length2);
892: if (value > 0)
893: return 1;
894: if (value < 0)
895: return -1;
896: return 0;
897: }
898: else
899: {
900: /* lexical ってか? */
901: u_short *p1 = start1;
902: u_short *p2 = start2;
903: u_short *e1 = start1 + length1;
904: u_short *e2 = start2 + length2;
905: u_short lastchar1 = 0, lastchar2 = 0;
906: int c1, c2;
907:
908: do
909: {
910: int co1, co2;
911:
912: /* char_order[X] が 0 のもの(シンボル)は無視する */
913: c1 = c2 = 0;
914: while (p1 < e1)
915: {
916: if (char_order[c1 = *p1++]) break;
917: c1 = 0;
918: }
919: while (p2 < e2)
920: {
921: if (char_order[c2 = *p2++]) break;
922: c2 = 0;
923: }
924:
925: /* 長音処理 */
926: if (c1 == 0x213C) /* ー */
927: c1 = Vowel_of(lastchar1);
928: if (c2 == 0x213C) /* ー */
929: c2 = Vowel_of(lastchar2);
930: co1 = char_order[c1];
931: co2 = char_order[c2];
932:
933: if (co1 != co2)
934: return co1 - co2;
935: else
936: lastchar1 = c1, lastchar2 = c2;
937:
938: } while(c1);
939:
940: /* Strings are equal except possibly for case. */
941: p1 = start1;
942: p2 = start2;
943: while (1)
944: {
945: int c1, c2;
946:
947: if (p1 == e1)
948: c1 = 0;
949: else
950: c1 = *p1++;
951: if (p2 == e2)
952: c2 = 0;
953: else
954: c2 = *p2++;
955:
956: if (c1 != c2)
957: /* Reverse sign here so upper case comes out last. */
958: return c2 - c1;
959: if (!c1)
960: break;
961: }
962:
963: return 0;
964: }
965: }
966:
967: /* A `struct linebuffer' is a structure which holds a line of text.
968: `readline' reads a line from a stream into a linebuffer
969: and works regardless of the length of the line. */
970:
971: struct linebuffer
972: {
973: long size;
974: u_short *buffer;
975: };
976:
977: /* Initialize LINEBUFFER for use. */
978:
979: void
980: initbuffer (linebuffer)
981: struct linebuffer *linebuffer;
982: {
983: linebuffer->size = 200;
984: linebuffer->buffer = (u_short *) xmalloc (200*sizeof(u_short));
985: }
986:
987: /* Read a line of text from STREAM into LINEBUFFER.
988: Return the length of the line. */
989:
990: long
991: readline (linebuffer, stream)
992: struct linebuffer *linebuffer;
993: FILE *stream;
994: {
995: u_short *buffer = linebuffer->buffer;
996: u_short *p = linebuffer->buffer;
997: u_short *end = p + linebuffer->size;
998:
999: while (1)
1000: {
1001: int c = getc (stream);
1002: if (p == end)
1003: {
1004: buffer =
1005: (u_short *) xrealloc (buffer,
1006: (linebuffer->size *= 2)*sizeof(u_short));
1007: p += buffer - linebuffer->buffer;
1008: end += buffer - linebuffer->buffer;
1009: linebuffer->buffer = buffer;
1010: }
1011: if (c < 0 || c == '\n')
1012: {
1013: *p = 0;
1014: break;
1015: }
1016: *p++ = c;
1017: }
1018:
1019: return p - buffer;
1020: }
1021:
1022: /* Sort an input file too big to sort in core. */
1023:
1024: void
1025: sort_offline (infile, nfiles, total, outfile)
1026: char *infile;
1027: int nfiles;
1028: long total;
1029: char *outfile;
1030: {
1031: /* More than enough. */
1032: int ntemps = 2 * (total + MAX_IN_CORE_SORT - 1) / MAX_IN_CORE_SORT;
1033: char **tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
1034: FILE *istream = fopen (infile, "r");
1035: int i;
1036: struct linebuffer lb;
1037: long linelength;
1038: int failure = 0;
1039:
1040: initbuffer (&lb);
1041:
1042: /* Read in one line of input data. */
1043:
1044: linelength = readline (&lb, istream);
1045:
1046: if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
1047: {
1048: error ("%s: not a texinfo index file", infile);
1049: return;
1050: }
1051:
1052: /* Split up the input into `ntemps' temporary files, or maybe fewer,
1053: and put the new files' names into `tempfiles' */
1054:
1055: for (i = 0; i < ntemps; i++)
1056: {
1057: char *outname = maketempname (++tempcount);
1058: FILE *ostream = fopen (outname, "w");
1059: long tempsize = 0;
1060:
1061: if (!ostream)
1062: pfatal_with_name (outname);
1063: tempfiles[i] = outname;
1064:
1065: /* Copy lines into this temp file as long as it does not make file
1066: "too big" or until there are no more lines. */
1067:
1068: while (tempsize + linelength + 1 <= MAX_IN_CORE_SORT)
1069: {
1070: tempsize += linelength + 1;
1071: {
1072: u_short *tp;
1073:
1074: for (tp = lb.buffer; *tp; tp++)
1075: putc(*tp, ostream);
1076: }
1077: putc ('\n', ostream);
1078:
1079: /* Read another line of input data. */
1080:
1081: linelength = readline (&lb, istream);
1082: if (!linelength && feof (istream))
1083: break;
1084:
1085: if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
1086: {
1087: error ("%s: not a texinfo index file", infile);
1088: failure = 1;
1089: goto fail;
1090: }
1091: }
1092: fclose (ostream);
1093: if (feof (istream))
1094: break;
1095: }
1096:
1097: free (lb.buffer);
1098:
1099: fail:
1100: /* Record number of temp files we actually needed. */
1101:
1102: ntemps = i;
1103:
1104: /* Sort each tempfile into another tempfile.
1105: Delete the first set of tempfiles and put the names of the second
1106: into `tempfiles'. */
1107:
1108: for (i = 0; i < ntemps; i++)
1109: {
1110: char *newtemp = maketempname (++tempcount);
1111: sort_in_core (&tempfiles[i], MAX_IN_CORE_SORT, newtemp);
1112: if (!keep_tempfiles)
1113: unlink (tempfiles[i]);
1114: tempfiles[i] = newtemp;
1115: }
1116:
1117: if (failure)
1118: return;
1119:
1120: /* Merge the tempfiles together and indexify. */
1121:
1122: merge_files (tempfiles, ntemps, outfile);
1123: }
1124:
1125: /* Sort INFILE, whose size is TOTAL,
1126: assuming that is small enough to be done in-core,
1127: then indexify it and send the output to OUTFILE (or to stdout). */
1128:
1129: void
1130: sort_in_core (infile, total, outfile)
1131: char *infile;
1132: long total;
1133: char *outfile;
1134: {
1135: u_short **nextline;
1136: u_short *data = (u_short *) xmalloc ((total + 1)*sizeof(u_short));
1137: u_short *file_data;
1138: long file_size;
1139: int i;
1140: FILE *ostream = stdout;
1141: struct lineinfo *lineinfo;
1142: FILE *ifp;
1143: int c;
1144:
1145: /* Read the contents of the file into the moby array `data'. */
1146: ifp = fopen(infile, "r");
1147: if (!ifp)
1148: fatal ("failure reopening %s", infile);
1149: memset(data, 0, (total + 1)*sizeof(u_short)); /* zero clear */
1150: for (i = 0, c = fgetc(ifp); c != EOF; c = fgetc(ifp), i++)
1151: {
1152: if (EUC_BYTE(c))
1153: {
1154: data[i] = (c & 0x007F) << 8;
1155: c = fgetc(ifp) & 0x007F;
1156: }
1157: data[i] += c;
1158: }
1159: file_size = i;
1160: file_data = data;
1161: data[file_size] = 0;
1162:
1163: fclose(ifp);
1164:
1165: if (file_size > 0 && data[0] != '\\' && data[0] != '@')
1166: {
1167: error ("%s: not a texinfo index file", infile);
1168: return;
1169: }
1170:
1171: init_char_order ();
1172:
1173: /* Sort routines want to know this address. */
1174:
1175: text_base = data;
1176:
1177: /* Create the array of pointers to lines, with a default size
1178: frequently enough. */
1179:
1180: nlines = total / 50;
1181: if (!nlines)
1182: nlines = 2;
1183: linearray = (u_short **) xmalloc (nlines * sizeof (u_short *));
1184:
1185: /* `nextline' points to the next free slot in this array.
1186: `nlines' is the allocated size. */
1187:
1188: nextline = linearray;
1189:
1190: /* Parse the input file's data, and make entries for the lines. */
1191:
1192: nextline = parsefile (nextline, file_data, file_size);
1193:
1194: if (nextline == 0)
1195: {
1196: error ("%s: not a texinfo index file", infile);
1197: return;
1198: }
1199:
1200: /* Sort the lines. */
1201:
1202: /* If we have enough space, find the first keyfield of each line in advance.
1203: Make a `struct lineinfo' for each line, which records the keyfield
1204: as well as the line, and sort them. */
1205:
1206: lineinfo = (struct lineinfo *) malloc ((nextline - linearray) * sizeof (struct lineinfo));
1207:
1208: if (lineinfo)
1209: {
1210: struct lineinfo *lp;
1211: u_short **p;
1212:
1213: for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1214: {
1215: lp->text = *p;
1216: lp->key.text = find_field (keyfields, *p, &lp->keylen);
1217: if (keyfields->numeric)
1218: lp->key.number = find_value (lp->key.text, lp->keylen);
1219: }
1220:
1221: qsort (lineinfo, nextline - linearray, sizeof (struct lineinfo), compare_prepared);
1222:
1223: for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1224: *p = lp->text;
1225:
1226: free (lineinfo);
1227: }
1228: else
1229: qsort (linearray, nextline - linearray, sizeof (u_short *), compare_full);
1230:
1231: /* Open the output file. */
1232:
1233: if (outfile)
1234: {
1235: ostream = fopen (outfile, "w");
1236: if (!ostream)
1237: pfatal_with_name (outfile);
1238: }
1239:
1240: writelines (linearray, nextline - linearray, ostream);
1241: if (outfile)
1242: fclose (ostream);
1243:
1244: free (linearray);
1245: free (data);
1246: }
1247:
1248: /* Parse an input string in core into lines.
1249: DATA is the input string, and SIZE is its length.
1250: Data goes in LINEARRAY starting at NEXTLINE.
1251: The value returned is the first entry in LINEARRAY still unused.
1252: Value 0 means input file contents are invalid. */
1253:
1254: u_short **
1255: parsefile (nextline, data, size)
1256: u_short **nextline;
1257: u_short *data;
1258: long size;
1259: {
1260: u_short *p, *end;
1261: u_short **line = nextline;
1262:
1263: p = data;
1264: end = p + size;
1265: *end = 0;
1266:
1267: while (p != end)
1268: {
1269: if (p[0] != '\\' && p[0] != '@')
1270: return 0;
1271:
1272: *line = p;
1273: while (*p && *p != '\n')
1274: p++;
1275: if (p != end)
1276: p++;
1277:
1278: line++;
1279: if (line == linearray + nlines)
1280: {
1281: u_short **old = linearray;
1282: linearray =
1283: (u_short **) xrealloc (linearray,
1284: sizeof (u_short *) * (nlines *= 4));
1285: line += linearray - old;
1286: }
1287: }
1288:
1289: return line;
1290: }
1291:
1292: /* Indexification is a filter applied to the sorted lines
1293: as they are being written to the output file.
1294: Multiple entries for the same name, with different page numbers,
1295: get combined into a single entry with multiple page numbers.
1296: The first braced field, which is used for sorting, is discarded.
1297: However, its first character is examined, folded to lower case,
1298: and if it is different from that in the previous line fed to us
1299: a \initial line is written with one argument, the new initial.
1300:
1301: If an entry has four braced fields, then the second and third
1302: constitute primary and secondary names.
1303: In this case, each change of primary name
1304: generates a \primary line which contains only the primary name,
1305: and in between these are \secondary lines which contain
1306: just a secondary name and page numbers. */
1307:
1308: /* The last primary name we wrote a \primary entry for.
1309: If only one level of indexing is being done, this is the last name seen. */
1310: u_short *lastprimary;
1311: /* Length of storage allocated for lastprimary. */
1312: int lastprimarylength;
1313:
1314: /* Similar, for the secondary name. */
1315: u_short *lastsecondary;
1316: int lastsecondarylength;
1317:
1318: /* Zero if we are not in the middle of writing an entry.
1319: One if we have written the beginning of an entry but have not
1320: yet written any page numbers into it.
1321: Greater than one if we have written the beginning of an entry
1322: plus at least one page number. */
1323: int pending;
1324:
1325: /* The initial (for sorting purposes) of the last primary entry written.
1326: When this changes, a \initial {c} line is written */
1327:
1328: u_short *lastinitial;
1329:
1330: int lastinitiallength;
1331:
1332: /* When we need a string of length 1 for the value of lastinitial,
1333: store it here. */
1334:
1335: u_short lastinitial1[2];
1336:
1337: /* Initialize static storage for writing an index. */
1338:
1339: void
1340: init_index ()
1341: {
1342: pending = 0;
1343: lastinitial = lastinitial1;
1344: lastinitial1[0] = 0;
1345: lastinitial1[1] = 0;
1346: lastinitiallength = 0;
1347: lastprimarylength = 100;
1348: lastprimary = (u_short *) xmalloc ((lastprimarylength + 1)*sizeof(u_short));
1349: memset (lastprimary, '\0', (lastprimarylength + 1)*sizeof(u_short));
1350: lastsecondarylength = 100;
1351: lastsecondary =
1352: (u_short *) xmalloc ((lastsecondarylength + 1)*sizeof(u_short));
1353: memset (lastsecondary, '\0', (lastsecondarylength + 1)*sizeof(u_short));
1354: }
1355:
1356: /* Indexify. Merge entries for the same name,
1357: insert headers for each initial character, etc. */
1358:
1359: void
1360: indexify (u_short *line, FILE *ostream)
1361: {
1362: u_short *primary, *secondary, *pagenumber;
1363: int primarylength, secondarylength = 0, pagelength;
1364: int nosecondary;
1365: int initiallength;
1366: u_short *initial;
1367: u_short initial1[2];
1368: register u_short *p;
1369:
1370: /* First, analyze the parts of the entry fed to us this time. */
1371:
1372: p = find_braced_pos (line, 0, 0, 0);
1373: if (*p == '{')
1374: {
1375: initial = p;
1376: /* Get length of inner pair of braces starting at `p',
1377: including that inner pair of braces. */
1378: initiallength = find_braced_end (p + 1) + 1 - p;
1379: }
1380: else
1381: {
1382: initial = initial1;
1383: initial1[0] = *p & 0x7F7F;
1384: initial1[1] = 0;
1385: initiallength = 1;
1386:
1387: if (initial1[0] >= 'a' && initial1[0] <= 'z')
1388: initial1[0] -= ' ';
1389: else if (initial1[0] > 0x2000)
1390: initial1[0] = char_order[initial1[0]];
1391: }
1392:
1393: pagenumber = find_braced_pos (line, 1, 0, 0);
1394: pagelength = find_braced_end (pagenumber) - pagenumber;
1395: if (pagelength == 0)
1396: abort ();
1397:
1398: primary = find_braced_pos (line, 2, 0, 0);
1399: primarylength = find_braced_end (primary) - primary;
1400:
1401: secondary = find_braced_pos (line, 3, 0, 0);
1402: nosecondary = !*secondary;
1403: if (!nosecondary)
1404: secondarylength = find_braced_end (secondary) - secondary;
1405:
1406: /* If the primary is different from before, make a new primary entry. */
1407: if (wstrncmp (primary, lastprimary, primarylength))
1408: {
1409: /* Close off current secondary entry first, if one is open. */
1410: if (pending)
1411: {
1412: fputs ("}\n", ostream);
1413: pending = 0;
1414: }
1415:
1416: /* If this primary has a different initial, include an entry for
1417: the initial. */
1418: if (initiallength != lastinitiallength ||
1419: (*initial && wstrncmp (initial, lastinitial, initiallength)))
1420: {
1421: fprintf (ostream, "\\initial {");
1422: wfwrite (initial, initiallength, 1, ostream);
1423: fprintf (ostream, "}\n", initial);
1424: if (initial == initial1)
1425: {
1426: lastinitial = lastinitial1;
1427: *lastinitial1 = *initial1;
1428: }
1429: else
1430: {
1431: lastinitial = initial;
1432: }
1433: lastinitiallength = initiallength;
1434: }
1435:
1436: /* Make the entry for the primary. */
1437: if (nosecondary)
1438: fputs ("\\entry {", ostream);
1439: else
1440: fputs ("\\primary {", ostream);
1441: wfwrite (primary, primarylength, 1, ostream);
1442: if (nosecondary)
1443: {
1444: fputs ("}{", ostream);
1445: pending = 1;
1446: }
1447: else
1448: fputs ("}\n", ostream);
1449:
1450: /* Record name of most recent primary. */
1451: if (lastprimarylength < primarylength)
1452: {
1453: lastprimarylength = primarylength + 100;
1454: lastprimary =
1455: (u_short *) xrealloc (lastprimary,
1456: (1 + lastprimarylength)*sizeof(u_short));
1457: }
1458: memcpy (lastprimary, primary, primarylength*sizeof(u_short));
1459: lastprimary[primarylength] = 0;
1460:
1461: /* There is no current secondary within this primary, now. */
1462: lastsecondary[0] = 0;
1463: }
1464:
1465: /* Should not have an entry with no subtopic following one with a subtopic. */
1466:
1467: if (nosecondary && *lastsecondary)
1468: {
1469: char tline[2001];
1470: int len;
1471:
1472: len = wtoc(tline, line, 2000); /* 2000 is much enough ? */
1473: tline[len] = '\0';
1474: error ("entry %s follows an entry with a secondary name", tline);
1475: }
1476:
1477: /* Start a new secondary entry if necessary. */
1478: if (!nosecondary && wstrncmp (secondary, lastsecondary, secondarylength))
1479: {
1480: if (pending)
1481: {
1482: fputs ("}\n", ostream);
1483: pending = 0;
1484: }
1485:
1486: /* Write the entry for the secondary. */
1487: fputs ("\\secondary {", ostream);
1488: wfwrite (secondary, secondarylength, 1, ostream);
1489: fputs ("}{", ostream);
1490: pending = 1;
1491:
1492: /* Record name of most recent secondary. */
1493: if (lastsecondarylength < secondarylength)
1494: {
1495: lastsecondarylength = secondarylength + 100;
1496: lastsecondary =
1497: (u_short *) xrealloc ((lastsecondary,
1498: 1 + lastsecondarylength)*sizeof(u_short));
1499: }
1500: memcpy (lastsecondary, secondary, secondarylength * sizeof(u_short));
1501: lastsecondary[secondarylength] = 0;
1502: }
1503:
1504: /* Here to add one more page number to the current entry. */
1505: if (pending++ != 1)
1506: fputs (", ", ostream); /* Punctuate first, if this is not the first. */
1507: wfwrite (pagenumber, pagelength, 1, ostream);
1508: }
1509:
1510: /* Close out any unfinished output entry. */
1511:
1512: void
1513: finish_index (ostream)
1514: FILE *ostream;
1515: {
1516: if (pending)
1517: fputs ("}\n", ostream);
1518: free (lastprimary);
1519: free (lastsecondary);
1520: }
1521:
1522: /* Copy the lines in the sorted order.
1523: Each line is copied out of the input file it was found in. */
1524:
1525: void
1526: writelines (linearray, nlines, ostream)
1527: u_short **linearray;
1528: int nlines;
1529: FILE *ostream;
1530: {
1531: u_short **stop_line = linearray + nlines;
1532: u_short **next_line;
1533:
1534: init_index ();
1535:
1536: /* Output the text of the lines, and free the buffer space. */
1537:
1538: for (next_line = linearray; next_line != stop_line; next_line++)
1539: {
1540: /* If -u was specified, output the line only if distinct from previous one. */
1541: if (next_line == linearray
1542: /* Compare previous line with this one, using only the
1543: explicitly specd keyfields. */
1544: || compare_general (*(next_line - 1), *next_line, 0L, 0L, num_keyfields - 1))
1545: {
1546: u_short *p = *next_line;
1547: u_short c;
1548:
1549: while ((c = *p++) && c != '\n')
1550: /* Do nothing. */ ;
1551: *(p - 1) = 0;
1552: indexify (*next_line, ostream);
1553: }
1554: }
1555:
1556: finish_index (ostream);
1557: }
1558:
1559: /* Assume (and optionally verify) that each input file is sorted;
1560: merge them and output the result.
1561: Returns nonzero if any input file fails to be sorted.
1562:
1563: This is the high-level interface that can handle an unlimited
1564: number of files. */
1565:
1566: #define MAX_DIRECT_MERGE 10
1567:
1568: int
1569: merge_files (infiles, nfiles, outfile)
1570: char **infiles;
1571: int nfiles;
1572: char *outfile;
1573: {
1574: char **tempfiles;
1575: int ntemps;
1576: int i;
1577: int value = 0;
1578: int start_tempcount = tempcount;
1579:
1580: if (nfiles <= MAX_DIRECT_MERGE)
1581: return merge_direct (infiles, nfiles, outfile);
1582:
1583: /* Merge groups of MAX_DIRECT_MERGE input files at a time,
1584: making a temporary file to hold each group's result. */
1585:
1586: ntemps = (nfiles + MAX_DIRECT_MERGE - 1) / MAX_DIRECT_MERGE;
1587: tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
1588: for (i = 0; i < ntemps; i++)
1589: {
1590: int nf = MAX_DIRECT_MERGE;
1591: if (i + 1 == ntemps)
1592: nf = nfiles - i * MAX_DIRECT_MERGE;
1593: tempfiles[i] = maketempname (++tempcount);
1594: value |= merge_direct (&infiles[i * MAX_DIRECT_MERGE], nf, tempfiles[i]);
1595: }
1596:
1597: /* All temporary files that existed before are no longer needed
1598: since their contents have been merged into our new tempfiles.
1599: So delete them. */
1600: flush_tempfiles (start_tempcount);
1601:
1602: /* Now merge the temporary files we created. */
1603:
1604: merge_files (tempfiles, ntemps, outfile);
1605:
1606: free (tempfiles);
1607:
1608: return value;
1609: }
1610:
1611: /* Assume (and optionally verify) that each input file is sorted;
1612: merge them and output the result.
1613: Returns nonzero if any input file fails to be sorted.
1614:
1615: This version of merging will not work if the number of
1616: input files gets too high. Higher level functions
1617: use it only with a bounded number of input files. */
1618:
1619: int
1620: merge_direct (infiles, nfiles, outfile)
1621: char **infiles;
1622: int nfiles;
1623: char *outfile;
1624: {
1625: struct linebuffer *lb1, *lb2;
1626: struct linebuffer **thisline, **prevline;
1627: FILE **streams;
1628: int i;
1629: int nleft;
1630: int lossage = 0;
1631: int *file_lossage;
1632: struct linebuffer *prev_out = 0;
1633: FILE *ostream = stdout;
1634:
1635: if (outfile)
1636: {
1637: ostream = fopen (outfile, "w");
1638: }
1639: if (!ostream)
1640: pfatal_with_name (outfile);
1641:
1642: init_index ();
1643:
1644: if (nfiles == 0)
1645: {
1646: if (outfile)
1647: fclose (ostream);
1648: return 0;
1649: }
1650:
1651: /* For each file, make two line buffers.
1652: Also, for each file, there is an element of `thisline'
1653: which points at any time to one of the file's two buffers,
1654: and an element of `prevline' which points to the other buffer.
1655: `thisline' is supposed to point to the next available line from the file,
1656: while `prevline' holds the last file line used,
1657: which is remembered so that we can verify that the file is properly sorted. */
1658:
1659: /* lb1 and lb2 contain one buffer each per file. */
1660: lb1 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1661: lb2 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1662:
1663: /* thisline[i] points to the linebuffer holding the next available line in file i,
1664: or is zero if there are no lines left in that file. */
1665: thisline = (struct linebuffer **)
1666: xmalloc (nfiles * sizeof (struct linebuffer *));
1667: /* prevline[i] points to the linebuffer holding the last used line
1668: from file i. This is just for verifying that file i is properly
1669: sorted. */
1670: prevline = (struct linebuffer **)
1671: xmalloc (nfiles * sizeof (struct linebuffer *));
1672: /* streams[i] holds the input stream for file i. */
1673: streams = (FILE **) xmalloc (nfiles * sizeof (FILE *));
1674: /* file_lossage[i] is nonzero if we already know file i is not
1675: properly sorted. */
1676: file_lossage = (int *) xmalloc (nfiles * sizeof (int));
1677:
1678: /* Allocate and initialize all that storage. */
1679:
1680: for (i = 0; i < nfiles; i++)
1681: {
1682: initbuffer (&lb1[i]);
1683: initbuffer (&lb2[i]);
1684: thisline[i] = &lb1[i];
1685: prevline[i] = &lb2[i];
1686: file_lossage[i] = 0;
1687: streams[i] = fopen (infiles[i], "r");
1688: if (!streams[i])
1689: pfatal_with_name (infiles[i]);
1690:
1691: readline (thisline[i], streams[i]);
1692: }
1693:
1694: /* Keep count of number of files not at eof. */
1695: nleft = nfiles;
1696:
1697: while (nleft)
1698: {
1699: struct linebuffer *best = 0;
1700: struct linebuffer *exch;
1701: int bestfile = -1;
1702: int i;
1703:
1704: /* Look at the next avail line of each file; choose the least one. */
1705:
1706: for (i = 0; i < nfiles; i++)
1707: {
1708: if (thisline[i] &&
1709: (!best ||
1710: 0 < compare_general (best->buffer, thisline[i]->buffer,
1711: (long) bestfile, (long) i, num_keyfields)))
1712: {
1713: best = thisline[i];
1714: bestfile = i;
1715: }
1716: }
1717:
1718: /* Output that line, unless it matches the previous one and we
1719: don't want duplicates. */
1720:
1721: if (!(prev_out &&
1722: !compare_general (prev_out->buffer,
1723: best->buffer, 0L, 1L, num_keyfields - 1)))
1724: indexify (best->buffer, ostream);
1725: prev_out = best;
1726:
1727: /* Now make the line the previous of its file, and fetch a new
1728: line from that file. */
1729:
1730: exch = prevline[bestfile];
1731: prevline[bestfile] = thisline[bestfile];
1732: thisline[bestfile] = exch;
1733:
1734: while (1)
1735: {
1736: /* If the file has no more, mark it empty. */
1737:
1738: if (feof (streams[bestfile]))
1739: {
1740: thisline[bestfile] = 0;
1741: /* Update the number of files still not empty. */
1742: nleft--;
1743: break;
1744: }
1745: readline (thisline[bestfile], streams[bestfile]);
1746: if (thisline[bestfile]->buffer[0] || !feof (streams[bestfile]))
1747: break;
1748: }
1749: }
1750:
1751: finish_index (ostream);
1752:
1753: /* Free all storage and close all input streams. */
1754:
1755: for (i = 0; i < nfiles; i++)
1756: {
1757: fclose (streams[i]);
1758: free (lb1[i].buffer);
1759: free (lb2[i].buffer);
1760: }
1761: free (file_lossage);
1762: free (lb1);
1763: free (lb2);
1764: free (thisline);
1765: free (prevline);
1766: free (streams);
1767:
1768: if (outfile)
1769: fclose (ostream);
1770:
1771: return lossage;
1772: }
1773:
1774: /* Print error message and exit. */
1775:
1776: void
1777: fatal (format, arg)
1778: char *format, *arg;
1779: {
1780: error (format, arg);
1781: exit (TI_FATAL_ERROR);
1782: }
1783:
1784: /* Print error message. FORMAT is printf control string, ARG is arg for it. */
1785: void
1786: error (format, arg)
1787: char *format, *arg;
1788: {
1789: printf ("%s: ", program_name);
1790: printf (format, arg);
1791: if (format[strlen (format) -1] != '\n')
1792: printf ("\n");
1793: }
1794:
1795: void
1796: perror_with_name (name)
1797: char *name;
1798: {
1799: char *s;
1800:
1801: s = strerror (errno);
1802: printf ("%s: ", program_name);
1803: printf ("%s; for file `%s'.\n", s, name);
1804: }
1805:
1806: void
1807: pfatal_with_name (name)
1808: char *name;
1809: {
1810: char *s;
1811:
1812: s = strerror (errno);
1813: printf ("%s: ", program_name);
1814: printf ("%s; for file `%s'.\n", s, name);
1815: exit (TI_FATAL_ERROR);
1816: }
1817:
1818: /* Return a newly-allocated string whose contents concatenate those of
1819: S1, S2, S3. */
1820:
1821: char *
1822: concat (s1, s2, s3)
1823: char *s1, *s2, *s3;
1824: {
1825: int len1 = strlen (s1), len2 = strlen (s2), len3 = strlen (s3);
1826: char *result = (char *) xmalloc (len1 + len2 + len3 + 1);
1827:
1828: strcpy (result, s1);
1829: strcpy (result + len1, s2);
1830: strcpy (result + len1 + len2, s3);
1831: *(result + len1 + len2 + len3) = 0;
1832:
1833: return result;
1834: }
1835:
1836: #if !defined (HAVE_STRERROR)
1837: extern char *sys_errlist[];
1838: extern int sys_nerr;
1839:
1840: char *
1841: strerror (num)
1842: int num;
1843: {
1844: if (num >= sys_nerr)
1845: return ("");
1846: else
1847: return (sys_errlist[num]);
1848: }
1849: #endif /* !HAVE_STRERROR */
1850:
1851: #if !defined (HAVE_STRCHR)
1852: char *
1853: strrchr (string, character)
1854: char *string;
1855: int character;
1856: {
1857: register int i;
1858:
1859: for (i = strlen (string) - 1; i > -1; i--)
1860: if (string[i] == character)
1861: return (string + i);
1862:
1863: return ((char *)NULL);
1864: }
1865: #endif /* HAVE_STRCHR */
1866:
1867: /* Just like malloc, but kills the program in case of fatal error. */
1868: void *
1869: xmalloc (nbytes)
1870: int nbytes;
1871: {
1872: void *temp = (void *) malloc (nbytes);
1873:
1874: if (nbytes && temp == (void *)NULL)
1875: memory_error ("xmalloc", nbytes);
1876:
1877: return (temp);
1878: }
1879:
1880: /* Like realloc (), but barfs if there isn't enough memory. */
1881: void *
1882: xrealloc (pointer, nbytes)
1883: void *pointer;
1884: int nbytes;
1885: {
1886: void *temp;
1887:
1888: if (!pointer)
1889: temp = (void *)xmalloc (nbytes);
1890: else
1891: temp = (void *)realloc (pointer, nbytes);
1892:
1893: if (nbytes && !temp)
1894: memory_error ("xrealloc", nbytes);
1895:
1896: return (temp);
1897: }
1898:
1899: void
1900: memory_error (callers_name, bytes_wanted)
1901: char *callers_name;
1902: int bytes_wanted;
1903: {
1904: char printable_string[80];
1905:
1906: sprintf (printable_string,
1907: "Virtual memory exhausted in %s ()! Needed %d bytes.",
1908: callers_name, bytes_wanted);
1909:
1910: error (printable_string);
1911: abort();
1912: }
1913:
1914:
1915: static int
1916: wstrncmp(const u_short *s1, const u_short *s2, size_t n)
1917: {
1918: int i;
1919: int d = 0;
1920:
1921: for (i=0; i<n && d == 0; i++)
1922: d = s1[i] - s2[i];
1923:
1924: return d;
1925: }
1926:
1927:
1928: static int
1929: wfwrite(u_short *ptr, size_t size, size_t nmemb, FILE*stream)
1930: {
1931: char *cptr;
1932: int ret, len;
1933:
1934: cptr = (char *)xmalloc(size * nmemb * sizeof(u_short)); /* 丼一杯! */
1935: len = wtoc(cptr, ptr, size*nmemb);
1936: assert(len <= (size * nmemb * sizeof(u_short)));
1937:
1938: ret = fwrite(cptr, len, nmemb, stream);
1939: free(cptr);
1940: return ret;
1941: }
1942:
1943:
1944: static int
1945: wtoc(char *cptr, u_short const *ptr, int len)
1946: {
1947: char *tptr;
1948: u_short c;
1949: int i;
1950:
1951: tptr = cptr;
1952: for (i = 0; i < len; i++)
1953: {
1954: c = ptr[i];
1955: if (G1_CHAR(c))
1956: {
1957: *tptr++ = c >> 8 | 0x0080; /* to EUC */
1958: c &= 0x007F;
1959: c |= 0x0080;
1960: }
1961: *tptr++ = c;
1962: }
1963:
1964: return tptr - cptr;
1965: }
1966:
1967:
1968: static int
1969: Vowel_of(int c)
1970: {
1971: if (0x2521 <= c && c < 0x2577) /* 片仮名は */
1972: c -= 0x100; /* 平仮名へ */
1973: /* ヴ、ヵ、ヶ、の3字は相当する平仮名はないが、コードがあると仮定する */
1974: /* 普通は問題にならない筈。 */
1975:
1976: if (0x2421 <= c && c < 0x2474)
1977: return Vowel_Table[c - 0x2420];
1978: else
1979: return c; /* 平仮名以外はそのまま */
1980: }
1981:
1982:
1983: static void
1984: init_Vowel_Table(void)
1985: {
1986: int i;
1987:
1988: /* あ〜ぞ */
1989: for (i = 0x2421 - 0x2420; i < 0x2445 - 0x2420; i += 10)
1990: {
1991: Vowel_Table[i] = Vowel_Table[i+1] = 0x2422; /* あ段 */
1992: Vowel_Table[i+2] = Vowel_Table[i+3] = 0x2424; /* い段 */
1993: Vowel_Table[i+4] = Vowel_Table[i+5] = 0x2426; /* う段 */
1994: Vowel_Table[i+6] = Vowel_Table[i+7] = 0x2428; /* え段 */
1995: Vowel_Table[i+8] = Vowel_Table[i+9] = 0x242A; /* お段 */
1996: }
1997: Vowel_Table[0x243F - 0x2420] = 0x2422; /* た */
1998: Vowel_Table[0x2440 - 0x2420] = 0x2422; /* だ */
1999: Vowel_Table[0x2441 - 0x2420] = 0x2424; /* ち */
2000: Vowel_Table[0x2442 - 0x2420] = 0x2424; /* ぢ */
2001: Vowel_Table[0x2443 - 0x2420] = 0x2426; /* っ */
2002: Vowel_Table[0x2444 - 0x2420] = 0x2426; /* つ */
2003: Vowel_Table[0x2445 - 0x2420] = 0x2426; /* づ */
2004: Vowel_Table[0x2446 - 0x2420] = 0x2428; /* て */
2005: Vowel_Table[0x2447 - 0x2420] = 0x2428; /* で */
2006: Vowel_Table[0x2448 - 0x2420] = 0x242A; /* と */
2007: Vowel_Table[0x2449 - 0x2420] = 0x242A; /* ど */
2008: Vowel_Table[0x244A - 0x2420] = 0x2422; /* な */
2009: Vowel_Table[0x244B - 0x2420] = 0x2424; /* に */
2010: Vowel_Table[0x244C - 0x2420] = 0x2426; /* ぬ */
2011: Vowel_Table[0x244D - 0x2420] = 0x2428; /* ね */
2012: Vowel_Table[0x244E - 0x2420] = 0x242A; /* の */
2013: /* は〜ぽ */
2014: for (i = 0x244F - 0x2420; i < 0x245E - 0x2420; i += 15)
2015: {
2016: Vowel_Table[i] = Vowel_Table[i+1] = Vowel_Table[i+2] = 0x2422;
2017: Vowel_Table[i+3] = Vowel_Table[i+4] = Vowel_Table[i+5] = 0x2424;
2018: Vowel_Table[i+6] = Vowel_Table[i+7] = Vowel_Table[i+8] = 0x2426;
2019: Vowel_Table[i+9] = Vowel_Table[i+11] = Vowel_Table[i+12] = 0x2428;
2020: Vowel_Table[i+12] = Vowel_Table[i+13] = Vowel_Table[i+14] = 0x242A;
2021: }
2022: Vowel_Table[0x245E - 0x2420] = 0x2422; /* ま */
2023: Vowel_Table[0x245F - 0x2420] = 0x2424; /* み */
2024: Vowel_Table[0x2460 - 0x2420] = 0x2426; /* む */
2025: Vowel_Table[0x2461 - 0x2420] = 0x2428; /* め */
2026: Vowel_Table[0x2462 - 0x2420] = 0x242A; /* も */
2027: Vowel_Table[0x2463 - 0x2420] = 0x2422; /* ゃ */
2028: Vowel_Table[0x2464 - 0x2420] = 0x2422; /* や */
2029: Vowel_Table[0x2465 - 0x2420] = 0x2426; /* ゅ */
2030: Vowel_Table[0x2466 - 0x2420] = 0x2426; /* ゆ */
2031: Vowel_Table[0x2467 - 0x2420] = 0x242A; /* ょ */
2032: Vowel_Table[0x2468 - 0x2420] = 0x242A; /* よ */
2033: Vowel_Table[0x2469 - 0x2420] = 0x2422; /* ら */
2034: Vowel_Table[0x246A - 0x2420] = 0x2424; /* り */
2035: Vowel_Table[0x246B - 0x2420] = 0x2426; /* る */
2036: Vowel_Table[0x246C - 0x2420] = 0x2428; /* れ */
2037: Vowel_Table[0x246D - 0x2420] = 0x242A; /* ろ */
2038: Vowel_Table[0x246E - 0x2420] = 0x2422; /* ゎ */
2039: Vowel_Table[0x246F - 0x2420] = 0x2422; /* わ */
2040: /* ゐ〜ヶ は、そのまま。母音への変換はしない。 */
2041: for (i = 0x2470 - 0x2420; i < 0x2477 - 0x2420; i++)
2042: Vowel_Table[i] = i;
2043: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>