View | Details | Raw Unified | Return to bug 218951
Collapse All | Expand All

(-)common/utext.cpp (-5 / +22 lines)
Lines 847-855 Link Here
847
//------------------------------------------------------------------------------
847
//------------------------------------------------------------------------------
848
848
849
// Chunk size.
849
// Chunk size.
850
//     Must be less than 85, because of byte mapping from UChar indexes to native indexes.
850
//     Must be less than 42  (256/6), because of byte mapping from UChar indexes to native indexes.
851
//     Worst case is three native bytes to one UChar.  (Supplementaries are 4 native bytes
851
//     Worst case there are six UTF-8 bytes per UChar.
852
//     to two UChars.)
852
//         obsolete 6 byte form fd + 5 trails maps to fffd
853
//         obsolete 5 byte form fc + 4 trails maps to fffd
854
//         non-shortest 4 byte forms maps to fffd
855
//         normal supplementaries map to a pair of utf-16, two utf8 bytes per utf-16 unit
856
//     mapToUChars array size must allow for the worst case, 6.
857
//     This could be brought down to 4, by treating fd and fc as pure illegal,
858
//     rather than obsolete lead bytes. But that is not compatible with the utf-8 access macros.
853
//
859
//
854
enum { UTF8_TEXT_CHUNK_SIZE=32 };
860
enum { UTF8_TEXT_CHUNK_SIZE=32 };
855
861
Lines 889-895 Link Here
889
                                                     //  Requires two extra slots,
895
                                                     //  Requires two extra slots,
890
                                                     //    one for a supplementary starting in the last normal position,
896
                                                     //    one for a supplementary starting in the last normal position,
891
                                                     //    and one for an entry for the buffer limit position.
897
                                                     //    and one for an entry for the buffer limit position.
892
    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
898
    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*6+6]; // Map native offset from bufNativeStart to
893
                                                     //   corresponding offset in filled part of buf.
899
                                                     //   corresponding offset in filled part of buf.
894
    int32_t   align;
900
    int32_t   align;
895
};
901
};
Lines 1032-1037 Link Here
1032
            // Requested index is in this buffer.
1038
            // Requested index is in this buffer.
1033
            u8b = (UTF8Buf *)ut->p;   // the current buffer
1039
            u8b = (UTF8Buf *)ut->p;   // the current buffer
1034
            mapIndex = ix - u8b->toUCharsMapStart;
1040
            mapIndex = ix - u8b->toUCharsMapStart;
1041
            U_ASSERT(mapIndex < (int32_t)sizeof(UTF8Buf::mapToUChars));
1035
            ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1042
            ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1036
            return TRUE;
1043
            return TRUE;
1037
1044
Lines 1298-1303 Link Here
1298
        // Can only do this if the incoming index is somewhere in the interior of the string.
1305
        // Can only do this if the incoming index is somewhere in the interior of the string.
1299
        //   If index is at the end, there is no character there to look at.
1306
        //   If index is at the end, there is no character there to look at.
1300
        if (ix != ut->b) {
1307
        if (ix != ut->b) {
1308
            // Note: this function will only move the index back if it is on a trail byte
1309
            //       and there is a preceding lead byte and the sequence from the lead
1310
            //       through this trail could be part of a valid UTF-8 sequence
1311
            //       Otherwise the index remains unchanged.
1301
            U8_SET_CP_START(s8, 0, ix);
1312
            U8_SET_CP_START(s8, 0, ix);
1302
        }
1313
        }
1303
1314
Lines 1311-1317 Link Here
1311
        UChar   *buf = u8b->buf;
1322
        UChar   *buf = u8b->buf;
1312
        uint8_t *mapToNative = u8b->mapToNative;
1323
        uint8_t *mapToNative = u8b->mapToNative;
1313
        uint8_t *mapToUChars = u8b->mapToUChars;
1324
        uint8_t *mapToUChars = u8b->mapToUChars;
1314
        int32_t  toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1);
1325
        int32_t  toUCharsMapStart = ix - sizeof(UTF8Buf::mapToUChars) + 1;
1326
        // Note that toUCharsMapStart can be negative. Happens when the remaining
1327
        // text from current position to the beginning is less than the buffer size.
1328
        // + 1 because mapToUChars must have a slot at the end for the bufNativeLimit entry.
1315
        int32_t  destIx = UTF8_TEXT_CHUNK_SIZE+2;   // Start in the overflow region
1329
        int32_t  destIx = UTF8_TEXT_CHUNK_SIZE+2;   // Start in the overflow region
1316
                                                    //   at end of buffer to leave room
1330
                                                    //   at end of buffer to leave room
1317
                                                    //   for a surrogate pair at the
1331
                                                    //   for a surrogate pair at the
Lines 1338-1343 Link Here
1338
            if (c<0x80) {
1352
            if (c<0x80) {
1339
                // Special case ASCII range for speed.
1353
                // Special case ASCII range for speed.
1340
                buf[destIx] = (UChar)c;
1354
                buf[destIx] = (UChar)c;
1355
                U_ASSERT(toUCharsMapStart <= srcIx);
1341
                mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
1356
                mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
1342
                mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
1357
                mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
1343
            } else {
1358
            } else {
Lines 1367-1372 Link Here
1367
                do {
1382
                do {
1368
                    mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
1383
                    mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
1369
                } while (sIx >= srcIx);
1384
                } while (sIx >= srcIx);
1385
                U_ASSERT(toUCharsMapStart <= (srcIx+1));
1370
1386
1371
                // Set native indexing limit to be the current position.
1387
                // Set native indexing limit to be the current position.
1372
                //   We are processing a non-ascii, non-native-indexing char now;
1388
                //   We are processing a non-ascii, non-native-indexing char now;
Lines 1541-1546 Link Here
1541
    U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit);
1557
    U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit);
1542
    U_ASSERT(index<=ut->chunkNativeLimit);
1558
    U_ASSERT(index<=ut->chunkNativeLimit);
1543
    int32_t mapIndex = index - u8b->toUCharsMapStart;
1559
    int32_t mapIndex = index - u8b->toUCharsMapStart;
1560
    U_ASSERT(mapIndex < (int32_t)sizeof(UTF8Buf::mapToUChars));
1544
    int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1561
    int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1545
    U_ASSERT(offset>=0 && offset<=ut->chunkLength);
1562
    U_ASSERT(offset>=0 && offset<=ut->chunkLength);
1546
    return offset;
1563
    return offset;
(-)test/intltest/utxttest.cpp (+62 lines)
Lines 67-72 Link Here
67
            if (exec) Ticket10983();  break;
67
            if (exec) Ticket10983();  break;
68
        case 7: name = "Ticket12130";
68
        case 7: name = "Ticket12130";
69
            if (exec) Ticket12130(); break;
69
            if (exec) Ticket12130(); break;
70
        case 8: name = "Ticket12888";
71
            if (exec) Ticket12888(); break;
70
        default: name = "";          break;
72
        default: name = "";          break;
71
    }
73
    }
72
}
74
}
Lines 1583-1585 Link Here
1583
    }
1585
    }
1584
    utext_close(&ut);
1586
    utext_close(&ut);
1585
}
1587
}
1588
1589
// Ticket 12888: bad handling of illegal utf-8 containing many instances of the archaic, now illegal,
1590
//               six byte utf-8 forms. Original implementation had an assumption that
1591
//               there would be at most three utf-8 bytes per UTF-16 code unit.
1592
//               The five and six byte sequences map to a single replacement character.
1593
1594
void UTextTest::Ticket12888() {
1595
    const char *badString =
1596
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1597
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1598
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1599
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1600
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1601
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1602
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1603
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1604
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1605
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1606
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1607
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1608
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1609
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1610
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1611
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1612
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1613
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1614
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
1615
            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80";
1616
1617
    UErrorCode status = U_ZERO_ERROR;
1618
    LocalUTextPointer ut(utext_openUTF8(NULL, badString, -1, &status));
1619
    TEST_SUCCESS(status);
1620
    for (;;) {
1621
        UChar32 c = utext_next32(ut.getAlias());
1622
        if (c == U_SENTINEL) {
1623
            break;
1624
        }
1625
    }
1626
    int32_t endIdx = utext_getNativeIndex(ut.getAlias());
1627
    if (endIdx != (int32_t)strlen(badString)) {
1628
        errln("%s:%d expected=%d, actual=%d", __FILE__, __LINE__, strlen(badString), endIdx);
1629
        return;
1630
    }
1631
1632
    for (int32_t prevIndex = endIdx; prevIndex>0;) {
1633
        UChar32 c = utext_previous32(ut.getAlias());
1634
        int32_t currentIndex = utext_getNativeIndex(ut.getAlias());
1635
        if (c != 0xfffd) {
1636
            errln("%s:%d (expected, actual, index) = (%d, %d, %d)\n",
1637
                    __FILE__, __LINE__, 0xfffd, c, currentIndex);
1638
            break;
1639
        }
1640
        if (currentIndex != prevIndex - 6) {
1641
            errln("%s:%d: wrong index. Expected, actual = %d, %d",
1642
                    __FILE__, __LINE__, prevIndex - 6, currentIndex);
1643
            break;
1644
        }
1645
        prevIndex = currentIndex;
1646
    }
1647
}
(-)test/intltest/utxttest.h (+1 lines)
Lines 38-43 Link Here
38
    void Ticket10562();
38
    void Ticket10562();
39
    void Ticket10983();
39
    void Ticket10983();
40
    void Ticket12130();
40
    void Ticket12130();
41
    void Ticket12888();
41
42
42
private:
43
private:
43
    struct m {                              // Map between native indices & code points.
44
    struct m {                              // Map between native indices & code points.

Return to bug 218951