Bug 36983

Summary: CD9660 unicode to utf-8 [hack]
Product: Base System
Reporter: hsw <hsw>
Component: kern
Assignee: freebsd-bugs (Nobody) <bugs>
Status: Closed FIXED    
Severity: Affects Only Me    
Priority: Normal    
Version: 4.5-STABLE   
Hardware: Any   
OS: Any   
Attachments:
Description Flags
file.diff
none
file.diff none

Description hsw 2002-04-11 03:40:01 UTC
        The cd9660 filing system cannot handle most unicode characters
	in file/directory names (especially Chinese/Japanese).

	This patch was a quick fix so I could retrieve files from
	such a CDROM, but it might be useful for someone who needs to
	read this kind of CDROM.

Fix: Difficult since the isochar routine in sys/isofs/cd9660/cd9660_util.c
	assumes a 1:1 correspondence between the unicode characters and the
	native single byte character set.  Most unicode chars are converted
	to '?'.

	Here is a quick workaround for anyone who has a similar problem:
	The following hack adds a state machine to the isochar routine
	to fool the caller of isochar into getting multiple bytes per
	unicode char.

	The resulting bytes are UTF-8, except for '?', '/', '%', space,
	and control chars, which are converted to '%' followed by two hex digits.

	Apply the patches in sys/isofs/cd9660/ to:
	  cd9660_rrip.c
	  cd9660_util.c
	  iso.h


/* If it's not the '.' entry of the root dir obey SP field */
 	if (c != 0 || isonum_733(isodir->extent) != ana->imp->root_extent)
@@ -645,7 +645,7 @@
 	*outlen = 0;

 	isochar(isodir->name, isodir->name + isonum_711(isodir->name_len),
-		imp->joliet_level, &c);
+		imp->joliet_level, &c, NULL);
 	tab = rrip_table_getname;
 	if (c == 0 || c == 1) {
 		cd9660_rrip_defname(isodir,&analyze);



-      return 2;
+      return (utf_state == NULL || *utf_state == 0) ? 2 : 0;
 }

 /*
@@ -101,12 +150,13 @@
 	int joliet_level;
 {
 	int i, j;
+        int utf_state = 0;
 	u_char c, *fnend = fn + fnlen, *isoend = isofn + isolen;

 	for (; fn != fnend; fn++) {
 		if (isofn == isoend)
 			return *fn;
-		isofn += isochar(isofn, isoend, joliet_level, &c);
+		isofn += isochar(isofn, isoend, joliet_level, &c, &utf_state);
 		if (c == ';') {
 			if (*fn++ != ';')
 				return fn[-1];
@@ -117,7 +167,7 @@
 			}
 			for (j = 0; isofn != isoend; j = j * 10 + c - '0')
 				isofn += isochar(isofn, isoend,
-						 joliet_level, &c);
+						 joliet_level, &c, &utf_state);
 			return i - j;
 		}
 		if (c != *fn) {
@@ -133,13 +183,13 @@
 		}
 	}
 	if (isofn != isoend) {
-		isofn += isochar(isofn, isoend, joliet_level, &c);
+		isofn += isochar(isofn, isoend, joliet_level, &c, &utf_state);
 		switch (c) {
 		default:
 			return -c;
 		case '.':
 			if (isofn != isoend) {
-				isochar(isofn, isoend, joliet_level, &c);
+				isochar(isofn, isoend, joliet_level, &c, &utf_state);
 				if (c == ';')
 					return 0;
 			}
@@ -165,6 +215,7 @@
 	int joliet_level;
 {
 	int fnidx = 0;
+	int utf_state = 0;
 	u_char c, d = '\0', *infnend = infn + infnlen;

 	if (assoc) {
@@ -172,7 +223,7 @@
 		fnidx++;
 	}
 	for (; infn != infnend; fnidx++) {
-		infn += isochar(infn, infnend, joliet_level, &c);
+		infn += isochar(infn, infnend, joliet_level, &c, &utf_state);

 		if (!original && !joliet_level && c >= 'A' && c <= 'Z')
 			*outfn++ = c + ('a' - 'A');





-int isochar __P((u_char *, u_char *, int, u_char *));
+int isochar __P((u_char *, u_char *, int, u_char *, int *));
 int isofncmp __P((u_char *, int, u_char *, int, int));
 void isofntrans __P((u_char *, int, u_char *, u_short *, int, int, int));
 ino_t isodirino __P((struct iso_directory_record *, struct iso_mnt *));
--l5sNYkFLihkCn7EmE4K3FADDklkstcfl0o26QhjOUK4KaHyZ
Content-Type: text/plain; name="file.diff"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="file.diff"

--- cd9660_rrip.c.orig	Sat Aug 28 08:46:06 1999
+++ cd9660_rrip.c	Tue Apr  9 10:28:19 2002
@@ -508,7 +508,7 @@
 	pwhead = isodir->name + isonum_711(isodir->name_len);
 	if (!(isonum_711(isodir->name_len)&1))
 		pwhead++;
-	isochar(isodir->name, pwhead, ana->imp->joliet_level, &c);
+	isochar(isodir->name, pwhead, ana->imp->joliet_level, &c, NULL);
How-To-Repeat: 	A CDROM burned by Nero Burning ROM (Chinese file names)

        mount -t cd9660 -o ro /dev/cd0a /cdrom
	ls -l /cdrom

        All files appear as ????? (varying numbers of '?').
        Because names with the same number of question marks collide,
        it is only possible to access one file/directory from each such set.
Comment 1 Craig Rodrigues freebsd_committer freebsd_triage 2005-10-17 15:55:21 UTC
State Changed
From-To: open->feedback
Comment 2 Craig Rodrigues freebsd_committer freebsd_triage 2005-10-17 15:58:31 UTC
State Changed
From-To: feedback->closed

In 2003, the following flag was added to mount_cd9660 (in FreeBSD 5.x and higher): 
-C charset 
Specify local charset to convert Unicode file names when using 
Joliet extensions.