1 /*        $NetBSD: fields.c,v 1.33 2013/01/20 10:12:58 apb Exp $      */
2 
3 /*-
4  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Ben Harris and Jaromir Dolecek.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*-
33  * Copyright (c) 1993
34  *        The Regents of the University of California.  All rights reserved.
35  *
36  * This code is derived from software contributed to Berkeley by
37  * Peter McIlroy.
38  *
39  * Redistribution and use in source and binary forms, with or without
40  * modification, are permitted provided that the following conditions
41  * are met:
42  * 1. Redistributions of source code must retain the above copyright
43  *    notice, this list of conditions and the following disclaimer.
44  * 2. Redistributions in binary form must reproduce the above copyright
45  *    notice, this list of conditions and the following disclaimer in the
46  *    documentation and/or other materials provided with the distribution.
47  * 3. Neither the name of the University nor the names of its contributors
48  *    may be used to endorse or promote products derived from this software
49  *    without specific prior written permission.
50  *
51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61  * SUCH DAMAGE.
62  */
63 
64 /* Subroutines to generate sort keys. */
65 
66 #include "sort.h"
67 
68 __RCSID("$NetBSD: fields.c,v 1.33 2013/01/20 10:12:58 apb Exp $");
69 
70 #define SKIP_BLANKS(ptr) {                                            \
71           if (BLANK & d_mask[*(ptr)])                                 \
72                     while (BLANK & d_mask[*(++(ptr))]);               \
73 }
74 
75 #define NEXTCOL(pos) {                                                          \
76           if (!SEP_FLAG)                                                        \
77                     while (BLANK & l_d_mask[*(++pos)]);               \
78           while ((*(pos+1) != '\0') && !((FLD_D | REC_D_F) & l_d_mask[*++pos]));\
79 }
80 
81 static u_char *enterfield(u_char *, const u_char *, struct field *, int);
82 static u_char *number(u_char *, const u_char *, u_char *, u_char *, int);
83 static u_char *length(u_char *, const u_char *, u_char *, u_char *, int);
84 
85 #define DECIMAL_POINT '.'
86 
87 /*
88  * constructs sort key with leading recheader, followed by the key,
89  * followed by the original line.
90  */
91 length_t
enterkey(RECHEADER * keybuf,const u_char * keybuf_end,u_char * line_data,size_t line_size,struct field fieldtable[])92 enterkey(RECHEADER *keybuf, const u_char *keybuf_end, u_char *line_data,
93     size_t line_size, struct field fieldtable[])
94           /* keybuf:           pointer to start of key */
95 {
96           int i;
97           u_char *l_d_mask;
98           u_char *lineend, *pos;
99           const u_char *endkey;
100           u_char *keypos;
101           struct coldesc *clpos;
102           int col = 1;
103           struct field *ftpos;
104 
105           l_d_mask = d_mask;
106           pos = line_data - 1;
107           lineend = line_data + line_size-1;
108                                         /* don't include rec_delimiter */
109 
110           for (i = 0; i < ncols; i++) {
111                     clpos = clist + i;
112                     for (; (col < clpos->num) && (pos < lineend); col++) {
113                               NEXTCOL(pos);
114                     }
115                     if (pos >= lineend)
116                               break;
117                     clpos->start = SEP_FLAG ? pos + 1 : pos;
118                     NEXTCOL(pos);
119                     clpos->end = pos;
120                     col++;
121                     if (pos >= lineend) {
122                               clpos->end = lineend;
123                               i++;
124                               break;
125                     }
126           }
127           for (; i <= ncols; i++)
128                     clist[i].start = clist[i].end = lineend;
129           if (clist[0].start < line_data)
130                     clist[0].start++;
131 
132           /*
133            * We write the sort keys (concatenated) followed by the
134            * original line data (for output) as the 'keybuf' data.
135            * keybuf->length is the number of key bytes + data bytes.
136            * keybuf->offset is the number of key bytes.
137            * We add a record separator weight after the key in case
138            * (as is usual) we need to preserve the order of equal lines,
139            * and for 'sort -u'.
140            * The key itself will have had the correct weight applied.
141            */
142           keypos = keybuf->data;
143           endkey = keybuf_end - line_size - 1;
144           if (endkey <= keypos)
145                     /* No room for any key bytes */
146                     return 1;
147 
148           for (ftpos = fieldtable + 1; ftpos->icol.num; ftpos++) {
149                     if ((keypos = enterfield(keypos, endkey, ftpos,
150                         fieldtable->flags)) == NULL)
151                               return (1);
152           }
153 
154           keybuf->offset = keypos - keybuf->data;
155           keybuf->length = keybuf->offset + line_size;
156 
157           /*
158            * Posix requires that equal keys be further sorted by the
159            * entire original record.
160            * NetBSD has (at least for some time) kept equal keys in
161            * their original order.
162            * For 'sort -u' posix_sort is unset.
163            */
164           keybuf->keylen = posix_sort ? keybuf->length : keybuf->offset;
165 
166           memcpy(keypos, line_data, line_size);
167           return (0);
168 }
169 
170 /*
171  * constructs a field (as defined by -k) within a key
172  */
173 static u_char *
enterfield(u_char * tablepos,const u_char * endkey,struct field * cur_fld,int gflags)174 enterfield(u_char *tablepos, const u_char *endkey, struct field *cur_fld,
175     int gflags)
176 {
177           u_char *start, *end, *lineend, *mask, *lweight;
178           struct column icol, tcol;
179           u_int flags;
180 
181           icol = cur_fld->icol;
182           tcol = cur_fld->tcol;
183           flags = cur_fld->flags;
184           start = icol.p->start;
185           lineend = clist[ncols].end;
186           if (flags & BI)
187                     SKIP_BLANKS(start);
188           start += icol.indent;
189           start = min(start, lineend);
190 
191           if (!tcol.num)
192                     end = lineend;
193           else {
194                     if (tcol.indent) {
195                               end = tcol.p->start;
196                               if (flags & BT)
197                                         SKIP_BLANKS(end);
198                               end += tcol.indent;
199                               end = min(end, lineend);
200                     } else
201                               end = tcol.p->end;
202           }
203 
204           if (flags & L)
205                     return length(tablepos, endkey, start, end, flags);
206           if (flags & N)
207                     return number(tablepos, endkey, start, end, flags);
208 
209           /* Bound check space - assuming nothing is skipped */
210           if (tablepos + (end - start) + 1 >= endkey)
211                     return NULL;
212 
213           mask = cur_fld->mask;
214           lweight = cur_fld->weights;
215           for (; start < end; start++) {
216                     if (!mask || mask[*start]) {
217                               *tablepos++ = lweight[*start];
218                     }
219           }
220           /* Add extra byte (absent from lweight) to sort short keys correctly */
221           *tablepos++ = lweight[REC_D];
222           return tablepos;
223 }
224 
225 /*
226  * Numbers are converted to a floating point format (exponent & mantissa)
227  * so that they compare correctly as sequence of unsigned bytes.
228  * Bytes 0x00 and 0xff are used to terminate positive and negative numbers
229  * to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12.
230  *
231  * The first byte contain the overall sign, exponent sign and some of the
232  * exponent. These have to be ordered (-ve value, decreasing exponent),
233  * zero, (+ve value, increasing exponent).
234  *
235  * The first byte is 0x80 for zero, 0xc0 for +ve with exponent 0.
236  * -ve values are the 1's compliments (so 0x7f isn't used!).
237  *
238  * This only leaves 63 byte values for +ve exponents - which isn't enough.
239  * The largest 4 exponent values are used to hold a byte count of the
240  * number of following bytes that contain 8 exponent bits per byte,
241  * This lets us sort exponents from -2^31 to +2^31.
242  *
243  * The mantissa is stored 2 digits per byte offset by 0x40, for negative
244  * numbers the order must be reversed (they are bit inverted).
245  *
246  * Reverse sorts are done by inverting the sign of the number.
247  */
248 #define MAX_EXP_ENC  ((int)sizeof(int))
249 
250 static u_char *
number(u_char * pos,const u_char * bufend,u_char * line,u_char * lineend,int reverse)251 number(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend,
252     int reverse)
253 {
254           int exponent = -1;
255           int had_dp = 0;
256           u_char *tline;
257           char ch;
258           unsigned int val;
259           u_char *last_nz_pos;
260           u_char negate;
261 
262           if (reverse & R)
263                     negate = 0xff;
264           else
265                     negate = 0;
266 
267           /* Give ourselves space for the key terminator */
268           bufend--;
269 
270           /* Ensure we have enough space for the exponent */
271           if (pos + 1 + MAX_EXP_ENC > bufend)
272                     return (NULL);
273 
274           SKIP_BLANKS(line);
275           if (*line == '-') { /* set the sign */
276                     negate ^= 0xff;
277                     line++;
278           } else if (*line == '+') {
279                     line++;
280           }
281 
282           /* eat initial zeroes */
283           for (; *line == '0' && line < lineend; line++)
284                     continue;
285 
286           /* calculate exponents */
287           if (*line == DECIMAL_POINT) {
288                     /* Decimal fraction */
289                     had_dp = 1;
290                     while (*++line == '0' && line < lineend)
291                               exponent--;
292           } else {
293                     /* Large (absolute) value, count digits */
294                     for (tline = line; *tline >= '0' &&
295                         *tline <= '9' && tline < lineend; tline++)
296                               exponent++;
297           }
298 
299           /* If the first/next character isn't a digit, value is zero */
300           if (*line < '1' || *line > '9' || line >= lineend) {
301                     /* This may be "0", "0.00", "000" or "fubar" but sorts as 0 */
302                     /* XXX what about NaN, NAN, inf and INF */
303                     *pos++ = 0x80;
304                     return pos;
305           }
306 
307           /* Maybe here we should allow for e+12 (etc) */
308 
309           if (exponent < 0x40 - MAX_EXP_ENC && -exponent < 0x40 - MAX_EXP_ENC) {
310                     /* Value ok for simple encoding */
311                     /* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
312                     exponent += 0xc0;
313                     *pos++ = negate ^ exponent;
314           } else {
315                     /* Out or range for a single byte */
316                     int c, t;
317                     t = exponent > 0 ? exponent : -exponent;
318                     /* Count how many 8-bit bytes are needed */
319                     for (c = 0; ; c++) {
320                               t >>= 8;
321                               if (t == 0)
322                                         break;
323                     }
324                     /* 'c' better be 0..3 here - but probably 0..1 */
325                     /* Offset just outside valid range */
326                     t = c + 0x40 - MAX_EXP_ENC;
327                     if (exponent < 0)
328                               t = -t;
329                     *pos++ = negate ^ (t + 0xc0);
330                     /* now add each byte, most significant first */
331                     for (; c >= 0; c--)
332                               *pos++ = negate ^ (exponent >> (c * 8));
333           }
334 
335           /* Finally add mantissa, 2 digits per byte */
336           for (last_nz_pos = pos; line < lineend; ) {
337                     if (pos >= bufend)
338                               return NULL;
339                     ch = *line++;
340                     val = (ch - '0') * 10;
341                     if (val > 90) {
342                               if (ch == DECIMAL_POINT && !had_dp) {
343                                         had_dp = 1;
344                                         continue;
345                               }
346                               break;
347                     }
348                     while (line < lineend) {
349                               ch = *line++;
350                               if (ch == DECIMAL_POINT && !had_dp) {
351                                         had_dp = 1;
352                                         continue;
353                               }
354                               if (ch < '0' || ch > '9')
355                                         line = lineend;
356                               else
357                                         val += ch - '0';
358                               break;
359                     }
360                     *pos++ = negate ^ (val + 0x40);
361                     if (val != 0)
362                               last_nz_pos = pos;
363           }
364 
365           /* Add key terminator, deleting any trailing "00" */
366           *last_nz_pos++ = negate;
367 
368           return (last_nz_pos);
369 }
370 
371 static u_char *
length(u_char * pos,const u_char * bufend,u_char * line,u_char * lineend,int flag)372 length(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend,
373     int flag)
374 {
375           u_char buf[32];
376           int l;
377           SKIP_BLANKS(line);
378           l = snprintf((char *)buf, sizeof(buf), "%td", lineend - line);
379           return number(pos, bufend, buf, buf + l, flag);
380 }
381