| Summary: | CD9660 unicode to utf-8 [hack] | ||||||||
|---|---|---|---|---|---|---|---|---|---|
| Product: | Base System | Reporter: | hsw <hsw> | ||||||
| Component: | kern | Assignee: | freebsd-bugs (Nobody) <bugs> | ||||||
| Status: | Closed FIXED | ||||||||
| Severity: | Affects Only Me | ||||||||
| Priority: | Normal | ||||||||
| Version: | 4.5-STABLE | ||||||||
| Hardware: | Any | ||||||||
| OS: | Any | ||||||||
| Attachments: |
|
||||||||
State Changed From-To: open->feedback. State Changed From-To: feedback->closed. In 2003, the following flag was added to mount_cd9660 (in FreeBSD 5.x and higher): -C charset: Specify local charset to convert Unicode file names when using Joliet extensions. |
The cd9660 file system cannot handle most Unicode characters in file/directory names (especially Chinese/Japanese). This patch was a quick fix so I could retrieve files from such a CDROM, but it might be useful for someone who needs to read this kind of CDROM. Fix: Difficult, since the isochar routine in sys/isofs/cd9660/cd9660_util.c assumes a 1:1 correspondence between the Unicode characters and the native single-byte character set. Most Unicode chars are converted to '?'. Here is a quick workaround for anyone who has a similar problem: The following hack adds a state machine to the isochar routine to fool the caller of isochar into getting multiple bytes per Unicode char. The resulting bytes are UTF-8, except for '?', '/', '%', space and control chars, which are converted to '%' followed by two hex digits. Apply the patches in sys/isofs/cd9660/ to: cd9660_rrip.c cd9660_util.c iso.h /* If it's not the '.' entry of the root dir obey SP field */ if (c != 0 || isonum_733(isodir->extent) != ana->imp->root_extent) @@ -645,7 +645,7 @@ *outlen = 0; isochar(isodir->name, isodir->name + isonum_711(isodir->name_len), - imp->joliet_level, &c); + imp->joliet_level, &c, NULL); tab = rrip_table_getname; if (c == 0 || c == 1) { cd9660_rrip_defname(isodir,&analyze); - return 2; + return (utf_state == NULL || *utf_state == 0) ? 
2 : 0; } /* @@ -101,12 +150,13 @@ int joliet_level; { int i, j; + int utf_state = 0; u_char c, *fnend = fn + fnlen, *isoend = isofn + isolen; for (; fn != fnend; fn++) { if (isofn == isoend) return *fn; - isofn += isochar(isofn, isoend, joliet_level, &c); + isofn += isochar(isofn, isoend, joliet_level, &c, &utf_state); if (c == ';') { if (*fn++ != ';') return fn[-1]; @@ -117,7 +167,7 @@ } for (j = 0; isofn != isoend; j = j * 10 + c - '0') isofn += isochar(isofn, isoend, - joliet_level, &c); + joliet_level, &c, &utf_state); return i - j; } if (c != *fn) { @@ -133,13 +183,13 @@ } } if (isofn != isoend) { - isofn += isochar(isofn, isoend, joliet_level, &c); + isofn += isochar(isofn, isoend, joliet_level, &c, &utf_state); switch (c) { default: return -c; case '.': if (isofn != isoend) { - isochar(isofn, isoend, joliet_level, &c); + isochar(isofn, isoend, joliet_level, &c, &utf_state); if (c == ';') return 0; } @@ -165,6 +215,7 @@ int joliet_level; { int fnidx = 0; + int utf_state = 0; u_char c, d = '\0', *infnend = infn + infnlen; if (assoc) { @@ -172,7 +223,7 @@ fnidx++; } for (; infn != infnend; fnidx++) { - infn += isochar(infn, infnend, joliet_level, &c); + infn += isochar(infn, infnend, joliet_level, &c, &utf_state); if (!original && !joliet_level && c >= 'A' && c <= 'Z') *outfn++ = c + ('a' - 'A'); -int isochar __P((u_char *, u_char *, int, u_char *)); +int isochar __P((u_char *, u_char *, int, u_char *, int *)); int isofncmp __P((u_char *, int, u_char *, int, int)); void isofntrans __P((u_char *, int, u_char *, u_short *, int, int, int)); ino_t isodirino __P((struct iso_directory_record *, struct iso_mnt *));--l5sNYkFLihkCn7EmE4K3FADDklkstcfl0o26QhjOUK4KaHyZ Content-Type: text/plain; name="file.diff" Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename="file.diff" --- cd9660_rrip.c.orig Sat Aug 28 08:46:06 1999 +++ cd9660_rrip.c Tue Apr 9 10:28:19 2002 @@ -508,7 +508,7 @@ pwhead = isodir->name + isonum_711(isodir->name_len); if 
(!(isonum_711(isodir->name_len)&1)) pwhead++; - isochar(isodir->name, pwhead, ana->imp->joliet_level, &c); + isochar(isodir->name, pwhead, ana->imp->joliet_level, &c, NULL); How-To-Repeat: Take a CDROM burned by Nero Burning ROM (with Chinese file names) and run: mount -t cd9660 -o ro /dev/cd0a /cdrom; ls -l /cdrom. All files show up as ????? (various numbers of '?'), and it is only possible to access one file/directory from each set of names that has the same number of question marks.