diff -u -N -r memtest86/Makefile memtest86.RvR/Makefile --- memtest86/Makefile Tue Feb 1 01:30:53 2000 +++ memtest86.RvR/Makefile Wed Apr 5 16:31:34 2000 @@ -33,7 +33,7 @@ REL_TXT_ADR=0x108800 REL_DAT_ADR=0x10e800 -OBJS=head.o main.o test.o init.o lib.o +OBJS=head.o main.o test.o init.o lib.o patn.o OBJDUMP=objdump -k -q OBJCOPY=objcopy -O binary -R .note -R .comment -R .stab -R .stabstr @@ -52,6 +52,9 @@ lib.o: lib.c test.h defs.h io.h serial.h cc -c $(CCFLAGS) lib.c + +patn.o: patn.c + cc -c $(CCFLAGS) patn.c relo: $(OBJS) ld -m elf_i386 -o $@ -e do_test -Ttext $(REL_TXT_ADR) -Tdata \ diff -u -N -r memtest86/README memtest86.RvR/README --- memtest86/README Tue Feb 1 01:30:53 2000 +++ memtest86.RvR/README Sat Apr 8 11:34:35 2000 @@ -300,6 +300,7 @@ 2) Refresh mode 3) Test selection 4) Address Range + 5) Printing Mode SP Set scroll lock (Stops scrolling of error messages) Note: Testing is stalled when the scroll lock is @@ -333,9 +334,15 @@ series when a sweep of memory has been completed. Four to eight sweeps of memory are done for each pattern depending of the test being executed. -The following information is displayed when a memory error is detected. -An error message is only displayed for errors with a different address or -failing bit pattern. All displayed values are in hexadecimal. +There are two modes of printing the results. By default, the detailed printing +mode is used. The other mode collects errors into BadRAM patterns to be used +as parameters to a booting Linux kernel. It is possible, at any time during +the tests, to change the printing mode. + +In the detailed printing mode, the following information is displayed when a +memory error is detected. An error message is only displayed for errors with a +different address or failing bit pattern. All displayed values are in +hexadecimal. 
Addr: Failing memory address Good: Current data pattern @@ -345,9 +352,22 @@ Count: Number of consecutive errors with the same address and failing bits +In the pattern printing mode, lines are printed in the form badram=F1,M1,F2,M2. +In each F/M pair, the F represents a fault address, and the corresponding M is +a bitmask for that address. These patterns state that faults have occurred in +addresses that equal F on all "1" bits in M. Such a pattern may capture more +errors than actually exist, but at least all the errors are captured. These +patterns have been designed to capture regular patterns of errors caused by the +hardware structure in a terse syntax. + +The BadRAM patterns are `grown' incrementally rather than `designed' from an +overview of all errors. The number of pairs is constrained to five for a number +of practical reasons. As a result, handcrafting patterns from the output in +detailed printing mode may, in exceptional cases, yield better results. + 10) Troubleshooting Memory Errors -================================ +================================= Once a memory error has been detected, determining the failing SIMM/DIMM module is not a clear cut procedure. With the large number of motherboard vendors and possible combinations of simm slots it would be difficult if @@ -377,6 +397,15 @@ If you are unable to use either of the previous techniques then you are left to selective replacement of modules to find the failure. +4) Avoiding allocation +The printing mode for BadRAM patterns is intended to construct boot time +parameters for a Linux kernel that is compiled with BadRAM support. This +work-around makes it possible for Linux to reliably run on your average damaged +RAM (or clearly panic if it cannot). For more information on BadRAM support +for Linux, see + + http://home.zonnet.nl/vanrein/badram + 11) Theory of Operation ======================= @@ -550,3 +582,5 @@ build.c are from the Linux 1.2.1 kernel and have been heavily modified. 
Doug Sisk provided code to support a console connected via a serial port. + +Rick van Rein provided code to construct and print BadRAM patterns. diff -u -N -r memtest86/init.c memtest86.RvR/init.c --- memtest86/init.c Fri Feb 11 23:28:53 2000 +++ memtest86.RvR/init.c Sat Apr 8 08:58:55 2000 @@ -198,6 +198,9 @@ } */ footer(); + + v->printmode=PRINTMODE_DETAILED; + v->numpatn=0; } /* check_ram - Determine if this address points to memory by checking diff -u -N -r memtest86/lib.c memtest86.RvR/lib.c --- memtest86/lib.c Tue Feb 1 01:30:53 2000 +++ memtest86.RvR/lib.c Sat Apr 8 08:58:09 2000 @@ -20,6 +20,7 @@ char save[POP_H][POP_W]; char buf[18]; + char *codes[] = { " Divide", " Debug", @@ -44,7 +45,7 @@ /* * Scroll the error message area of the screen - * Starts at line LINE_SCROLL and ends at line 24 + * Starts at line LINE_SCROLL and ends at line 23 */ void scroll() { int i, j; @@ -55,6 +56,12 @@ *(s-160) = *s; } } + /* Clear the newly opened line */ + s = (char *)(SCREEN_ADR + (23 * 160)); + for (j=0; j<80; j++) { + *s = ' '; + s += 2; + } } /* @@ -397,7 +404,7 @@ void get_config() { - int flag = 0, i; + int flag = 0, i, prt = 0; long addr; popup(); cprint(POP_Y+1, POP_X+2, "Configuration:"); @@ -405,7 +412,8 @@ cprint(POP_Y+4, POP_X+6, "(2) Refresh Timing"); cprint(POP_Y+5, POP_X+6, "(3) Test Selection"); cprint(POP_Y+6, POP_X+6, "(4) Address Range"); - cprint(POP_Y+7, POP_X+6, "(0) Cancel"); + cprint(POP_Y+7, POP_X+6, "(5) Printing Mode"); + cprint(POP_Y+8, POP_X+6, "(0) Cancel"); /* Wait for key release */ while ((get_key() & 0x80) == 0); @@ -504,7 +512,7 @@ cprint(POP_Y+5, POP_X+6, "(3) All Tests"); cprint(POP_Y+6, POP_X+6, "(4) Skip Current Test"); cprint(POP_Y+7, POP_X+6, "(5) Select Test"); - cprint(POP_Y+8, POP_X+6, "(0) Cancel"); + cprint(POP_Y+9, POP_X+6, "(0) Cancel"); if (v->testsel < 0) { cprint(POP_Y+3+v->xtst_flag, POP_X+5, ">"); } else { @@ -624,6 +632,35 @@ } } break; + case 6: + /* 5 - Printing Mode */ + popclear(); + cprint(POP_Y+1, POP_X+2, 
"Printing Mode:"); + cprint(POP_Y+3, POP_X+6, "(1) Detailed Errors"); + cprint(POP_Y+4, POP_X+6, "(2) BadRAM Patterns"); + cprint(POP_Y+5, POP_X+6, "(0) Cancel"); + cprint(POP_Y+3+v->printmode, POP_X+5, ">"); + while ((get_key() & 0x80) == 0); + while (!flag) { + switch(get_key()) { + case 2: + /* Detailed Errors */ + v->printmode=PRINTMODE_DETAILED; + flag++; + break; + case 3: + /* BadRAM Patterns */ + v->printmode=PRINTMODE_PATTERNS; + flag++; + prt++; + break; + case 11: + /* 0 - Cancel */ + flag++; + break; + } + } + break; case 11: /* 0 - Cancel */ flag++; @@ -631,7 +668,11 @@ } } popdown(); + if (prt) { + printpatn(); + } } + void popup() { diff -u -N -r memtest86/patn.c memtest86.RvR/patn.c --- memtest86/patn.c Thu Jan 1 01:00:00 1970 +++ memtest86.RvR/patn.c Fri Apr 7 23:57:27 2000 @@ -0,0 +1,141 @@ +/* Pattern extension for memtest86 + * + * Generates patterns for the Linux kernel's BadRAM extension that avoids + * allocation of faulty pages. + * + * Released under version 2 of the Gnu Public License. + * + * By Rick van Rein, vanrein@zonnet.nl + */ + + +#include "test.h" + + +/* + * DEFAULT_MASK covers a longword, since that is the testing granularity. + */ +#define DEFAULT_MASK ((~0L) << 2) + + +extern struct vars *v; + + +/* What it does: + * - Keep track of a number of BadRAM patterns in an array; + * - Combine new faulty addresses with it whenever possible; + * - Keep masks as selective as possible by minimising resulting faults; + * - Print a new pattern only when the pattern array is changed. + */ + + +/* Combine two adr/mask pairs to one adr/mask pair. + */ +void combine (ulong adr1, ulong mask1, ulong adr2, ulong mask2, + ulong *adr, ulong *mask) { + ulong adrxor=adr1 ^ adr2; + *mask = mask1 & mask2 & ~adrxor; + *adr = adr1 | adr2; + *adr &= *mask; // Normalise, no fundamental need for this +} + +/* Count the number of addresses covered with a mask. + */ +ulong addresses (ulong mask) { + ulong ctr=1; + int i=32; + while (i-- > 0) { + if (! 
(mask & 1)) { + ctr += ctr; + } + mask >>= 1; + } + return ctr; +} + +/* Count how many more addresses would be covered by adr1/mask1 when combined + * with adr2/mask2. + */ +ulong combicost (ulong adr1, ulong mask1, ulong adr2, ulong mask2) { + ulong cost1=addresses (mask1); + ulong tmp, mask; + combine (adr1, mask1, adr2, mask2, &tmp, &mask); + return addresses (mask) - cost1; +} + +/* Find the cheapest array index to extend with the given adr/mask pair. + * Return -1 if nothing below the given minimum cost can be found. + */ +int cheapindex (ulong adr1, ulong mask1, ulong mincost) { + int i=v->numpatn; + int idx=-1; + while (i-- > 0) { + ulong tmpcost=combicost(v->patn[i].adr, v->patn[i].mask, adr1, mask1); + if (tmpcost < mincost) { + mincost=tmpcost; + idx=i; + } + } + return idx; +} + +/* Try to find a relocation index for idx if it costs nothing. + * Return -1 if no such index exists. + */ +int relocateidx (int idx) { + ulong adr =v->patn[idx].adr; + ulong mask=v->patn[idx].mask; + int new; + v->patn[idx].adr ^= ~0L; // Never select idx + new=cheapindex (adr, mask, 1+addresses (mask)); + v->patn[idx].adr = adr; + return new; +} + +/* Relocate the given index idx only if free of charge. + * This is useful to combine `neighbouring' sections into one. + * Inspired by the Buddy memalloc principle in the Linux kernel. + * Recursion follows the merged entry, which compaction may move to idx. */ +void relocateiffree (int idx) { + int newidx=relocateidx (idx); + if (newidx>=0) { + ulong cadr, cmask; + combine (v->patn [newidx].adr, v->patn[newidx].mask, + v->patn [ idx].adr, v->patn[ idx].mask, + &cadr, &cmask); + v->patn[newidx].adr =cadr; + v->patn[newidx].mask=cmask; + if (idx < --v->numpatn) { + v->patn[idx].adr =v->patn[v->numpatn].adr; + v->patn[idx].mask=v->patn[v->numpatn].mask; + } + relocateiffree (newidx == v->numpatn ? idx : newidx); + } +} + +/* Insert a single faulty address in the pattern array. + * Return 1 only if the array was changed. 
+ */ +int insertaddress (ulong adr) { + ulong cadr, cmask; + int idx=cheapindex (adr, DEFAULT_MASK, 1L); + if (idx != -1) { + return 0; /* Already covered by a pattern: array unchanged */ + } + if (v->numpatn < BADRAM_MAXPATNS) { + v->patn[v->numpatn].adr =adr; + v->patn[v->numpatn].mask=DEFAULT_MASK; + v->numpatn++; + relocateiffree (v->numpatn-1); + return 1; + } + idx=cheapindex (adr, DEFAULT_MASK, ~0L); /* Array full: widen the cheapest */ + if (idx != -1) { + combine (v->patn [idx].adr, v->patn[idx].mask, + adr, DEFAULT_MASK, &cadr, &cmask); + v->patn[idx].adr =cadr; + v->patn[idx].mask=cmask; + relocateiffree (idx); + } + return idx != -1; +} diff -u -N -r memtest86/test.c memtest86.RvR/test.c --- memtest86/test.c Fri Feb 11 23:31:20 2000 +++ memtest86.RvR/test.c Sat Apr 8 08:58:32 2000 @@ -575,6 +575,10 @@ static int cnt = 0; long xor; char buf[20]; + int patnchg; + + /* Process the address in the pattern administration */ + patnchg=insertaddress ((unsigned long) adr); /* Check for keyboard input */ check_input(); @@ -588,37 +592,91 @@ return; } - /* Advance line for error message - * If at the bottom of the screen, scroll */ - if (v->msg_line < 23) { - v->msg_line++; - } else { - /* If scroll lock is on, loop till it is cleared */ - while (slock) { - check_input(); + if (v->printmode == PRINTMODE_DETAILED) { + + mkerrorline (); + + cnt = 1; + strcpy(buf,"Err - Addr:"); + buf[4] = v->test + '0'; + cprint(v->msg_line, 0, buf); + hprint(v->msg_line, 13, (long)adr); + cprint(v->msg_line, 23, "Good:"); + hprint(v->msg_line, 28, good); + cprint(v->msg_line, 38, "Bad:"); + hprint(v->msg_line, 42, bad); + cprint(v->msg_line, 52, "Xor:"); + hprint(v->msg_line, 56, xor); + cprint(v->msg_line, 66, "Count:"); + dprint(v->msg_line, 72, cnt, 5, 1); + + } else { + + /* printmode == PRINTMODE_PATTERNS */ + + if (patnchg) { + printpatn(); } - scroll(); - ttyprint(23,0,"\n"); - } - cnt = 1; - strcpy(buf,"Err - Addr:"); - buf[4] = v->test + '0'; - cprint(v->msg_line, 0, buf); - hprint(v->msg_line, 13, (long)adr); - cprint(v->msg_line, 23, "Good:"); - 
hprint(v->msg_line, 28, good); - cprint(v->msg_line, 38, "Bad:"); - hprint(v->msg_line, 42, bad); - cprint(v->msg_line, 52, "Xor:"); - hprint(v->msg_line, 56, xor); - cprint(v->msg_line, 66, "Count:"); - dprint(v->msg_line, 72, cnt, 5, 1); + } + v->eadr = adr; v->exor = xor; dprint(LINE_ERR, 72, ++(v->ecount), 6, 1); } + +/* Print the pattern array as a LILO boot option addressing BadRAM support. + * Prints nothing while the array is empty; three adr,mask pairs fit a line. */ +void printpatn (void) +{ + int idx=0; + int x; + + if (v->numpatn == 0) + return; + + mkerrorline (); + + cprint (v->msg_line, 0, "badram="); + x=7; + + for (idx = 0; idx < v->numpatn; idx++) { + + if (x > 80-22) { + mkerrorline (); + x=7; + } + cprint (v->msg_line, x, "0x"); + hprint (v->msg_line, x+2, v->patn[idx].adr ); + cprint (v->msg_line, x+10, ",0x"); + hprint (v->msg_line, x+13, v->patn[idx].mask); + if (idx+1 < v->numpatn) + cprint (v->msg_line, x+21, ","); + x+=22; + } +} + + +/* + * Make a line available for error printing. Afterwards it is v->msg_line. + */ +void mkerrorline (void) +{ + /* Advance one line; if at the bottom of the screen, scroll instead */ + if (v->msg_line < 23) { + v->msg_line++; + } else { + /* If scroll lock is on, loop till it is cleared */ + while (slock) { + check_input(); + } + scroll(); + ttyprint(23,0,"\n"); + } +} + + /* * Display address error message. 
*/ diff -u -N -r memtest86/test.h memtest86.RvR/test.h --- memtest86/test.h Fri Feb 11 23:31:34 2000 +++ memtest86.RvR/test.h Sat Apr 8 08:59:04 2000 @@ -48,6 +48,9 @@ lsr = serial_echo_inb(UART_LSR); \ } while ((lsr & BOTH_EMPTY) != BOTH_EMPTY) +int insertaddress(unsigned long); +void printpatn(void); +void mkerrorline(void); void itoa(char s[], int n); void reverse(char *p); void serial_echo_init(); @@ -77,6 +80,7 @@ void popdown(); void popclear(); void get_config(); +void get_printmode(); void cpu_type(); void addr_tst1(); void addr_tst2(); @@ -88,6 +92,18 @@ extern long idt_descr; extern long trap_regs[]; +#define PRINTMODE_DETAILED 0 +#define PRINTMODE_PATTERNS 1 + +#define BADRAM_MAXPATNS 5 +typedef unsigned long ulong; + +struct pair { + ulong adr; + ulong mask; +}; + + extern __inline__ void cache_off() { __asm__("push %eax\n\t" @@ -179,4 +195,7 @@ unsigned long startl; unsigned long snaph; unsigned long snapl; + int printmode; + int numpatn; + struct pair patn [BADRAM_MAXPATNS]; };