;;; experimentation in encoding polynomials for faster multiplication.
;;; Richard Fateman

;;;; REVISED SEPTEMBER, 2008 to try using displaced arrays.
;;;; see polymultk2.

;;; September, 2003
;;; This file just does polynomial mult of univariate arrays of bignums.
;;; The variable is NOT mentioned anywhere.



;;; More notes, added 4/15/05 RJF
;;; This is simpler for certain implementations of algorithms.
;;; In particular, using a Karatsuba or Toom-Cook splitting.
;;; If your polynomial is  #(x 10 20 30 40) then half-splitting
;;; means creating  #(x 10 20)  and #(x 30 40).  If your polynomial
;;; is #(10 20 30 40) then the splitting can be done by indexing
;;; into the array or vector to get #(10 20) and #(30 40), and
;;; no new data. Or as we are doing here, by using subseq to pick out
;;; subsequences. It would be kind of ugly to pick out a subsequence
;;; and then paste a header on the front of it.

;;; Also, we are exploring the programming of a Toom-Cook type
;;; algorithm which uses n^1.46 mults, by splitting into triples,
;;; i.e. ax^2+bx+c.


;;; How to encode zero? #(0) works better, I think, than #().
;;; A Normal polynomials should have no trailing zeros. not enforced.
;;; defect is it doesn't normalize so that  p-p  might be #(0 0 0 0...).

;; Example: #(10 30 21) means 10+30*x+21*x^2 = 10+x*(30+x*21)

(eval-when (compile load)
  (declaim (optimize (speed 3) (safety 0) (space 0) (compilation-speed 0)))
  (declaim (inline svref = eql make-array)))

(defun polymult(p q) ;; by ordinary poly multiplication.
  (declare (type (array integer (*)) p q))
  (assert (arrayp p))
  (assert (arrayp q))
  
  (let* ((plim (length p)) ;; degree is plim-1
	 (qlim (length q))
	 (p_i 0)
	 (ans(make-array (+ plim qlim -1) :initial-element 0)))
    (declare(fixnum qlim plim)(type (array integer (*)) ans))
    (do ((i 0 (1+ i)) )
	((= i plim) ans)
      (declare (fixnum i))
      (setf p_i (aref p i))
      
      (unless (= p_i 0)
      (do ((j 0 (1+ j)))
	  ((= j qlim) nil)
	(declare (fixnum j))
	(incf (aref ans (+ i j))
	      (*  p_i  ; (aref p i)
		  (aref q j))))))))	     

;;  compute a power of p 
(defun ppower(p n) (if (= n 1) p (polymult p (ppower p (1- n)))))

(defun ppowerk(p n) (if (= n 1) p (polymultk p (ppower p (1- n)))))

;;An implementation of Karatsuba multiplication

#|Assume p is of degree 2n-1,  i.e.  p= a*x^n+ b  where a is degree n-1 and b is degree n-1.
 assume q is also of degree 2n-1,  i.e.  q= c*x^n+d 

compute U= (a+b)*(c+d)  =  a*c+a*d+b*c+b*d,  a polynomial of degree k=2*(n-1) 
compute S= a*c   of degree k
compute T= b*d   of degree k
compute U = U-S-T = a*d+b*c of degree k

Then V=p*q = S*x^(2n)+U*x^n + T.

from first principles,  V =p*q should be of degree 2*(2n-1) = 4n-2. 
from the form above, k+2n= 4n-2.
note that U*x^n is of degree 2*(n-1)+n =  3n-1 so that it OVERLAPS the S*x^(2n) component.
note that T     is of degree 2*(n-1)   =  2n-2 so that it OVERLAPS the U*x^(n)   component.


  (the trick is that, used recursively to multiply a*c etc,
    this saves coefficient mults, instead of nm, max(n,m)^1.5)

if p, q, are of degree (say) 10 or so, use conventional arithmetic. Just a guess.
What about this 2n-1 business? Find the max degree of p, q.  If it is odd, we
are set.  If it is even, 2n, do this:
  p=a*x^n+ b  where b is of degree n-1, but a is of degree n. Same deal works. I think.
|#

;;(defvar *karatlim* 12)
(defparameter *karatlim* 40)  ;; an educated guess -- don't use karatsuba for small probs.

(defun polymultk(pp qq);; by Karatsuba.
  (assert (arrayp pp))
  (assert (arrayp qq))
  (let* ((plim (length pp)) 
	 (qlim (length qq)))
    (declare (fixnum qlim plim)(type (array t) aa pp qq))
    (if (< (min plim qlim) *karatlim*)(polymult pp qq); don't use for small polys
      (let* (aa
	     (h (truncate (max plim qlim) 2))
	     (A (subseq pp (min h plim) plim))
	     (B (subseq pp 0 (min h plim)))
	     (C (subseq qq (min h qlim) qlim))
	     (D (subseq qq 0 (min h qlim)))
	     (U (polymultk(polyadd A B) (polyadd C D)))
	     (TT (polymultk A C))
	     (S (polymultk B D)))
	(polysub-into S U 0)		; change U to  U-S
	(polysub-into TT U 0)		; change U to  U-S-T
	(setf aa (polyshiftleft TT (ash h 1))) ;aa = T*x^(2h)
	(polyadd-into U aa h)		; change aa to T*x^(2h)+U*x^h
	(polyadd-into S aa 0)		; change aa to T*x^(2h)+U*x^h +S
	(pnorm aa) ;remove trailing 0's if any
	))))



(defun polymultk2(pp qq);; by Karatsuba.,  use displaced arrays
  (assert (arrayp pp))
  (assert (arrayp qq))
  (let* ((plim (length pp)) 
	 (qlim (length qq)))
    (declare (fixnum qlim plim)(type (array t) aa pp qq))
    (if (or (< plim *karatlim*)
	    (< qlim *karatlim*))
	(polymult pp qq)		; don't use for small polys

      (let* ((aa nil)
	     (h (ash (min plim qlim) -1))
	     (A (make-array (- plim h)  :displaced-to pp :displaced-index-offset (min h plim)))
	     (B (make-array (min h plim) :displaced-to pp))
	     (C (make-array (- qlim h) :displaced-to qq  :displaced-index-offset (min h qlim)))
	     (D (make-array (min h qlim) :displaced-to qq))
	     (U (polymultk2(polyadd A B) (polyadd C D)))
	     (TT (polymultk2 A C))
	     (S (polymultk2 B D)))
	(polysub-into S U 0)		; change U to  U-S
	(polysub-into TT U 0)		; change U to  U-S-T
	(setf aa (polyshiftleft TT (ash h 1))) ;aa = T*x^(2h)
	(polyadd-into U aa h)		; change aa to T*x^(2h)+U*x^h
	(polyadd-into S aa 0)		; change aa to T*x^(2h)+U*x^h +S
	(pnorm aa) ;remove trailing 0's if any
	))))

 	 
(defun polyshiftleft(p k)		; like multiplying by x^k
  ;; k is positive integer, p is a raw polynomial, no header
  (declare (type (array t (*)) p)(fixnum k))
  (let* ((oldlen (length p))
	 (newlen (+ oldlen k))
	 (newpoly (make-array newlen :initial-element 0)))
    (declare (fixnum newlen oldlen))
    (do ((i  (1- oldlen) (1- i)))
	((< i 0) newpoly)
      (declare (fixnum i k))
      (setf (aref newpoly (+ k i))(aref p i)))))


(defun polyadd(p q)
  (declare (type (array t (*)) p q))
  (let* ((plim (length p))(qlim (length q))
	(aa (make-array (max plim qlim) :initial-element 0)))
      (if (> plim qlim)
	  (polyadd-1 p q plim qlim aa)
	(polyadd-1 q p qlim plim aa))
      ;; we could normalize here, removing all trailing zeros but one
      ;; we would rather wait until we know we care about it.
      ;;(pnorm aa)
      ))

(defun polysub(p q) ;; subtraction. p-q
    (declare (type (array t (*)) p q))
  (let* ((plim (length p))(qlim (length q))
	(aa (make-array (max plim qlim) :initial-element 0)))
    (polysub-1 p q plim qlim aa)
      ;; we could normalize here, removing all trailing zeros but one
      ;; we would rather wait until we know we care about it.
      ;;(pnorm aa)
      ))
  
(defun polyadd-1 (p q pl ql aa) 
  ;; p is longer, length pl. same length as array aa, filled with zeros.
  ;; p and q are unchanged.
      (declare (type (array t (*)) p q aa)(fixnum pl ql))
  (do ((i 0 (1+ i)))
      ((= i pl))
    (declare (fixnum i))
    (setf (aref aa i)(aref p i)))	; copy longer input to answer
  (do ((i 0 (1+ i)))
      ((= i ql) aa)
    (declare (fixnum i))
    (incf (aref aa i)(aref q i))))

(defun polysub-1 (p q pl ql aa) 
  (declare (type (array t (*)) p q aa)(fixnum pl ql))	
  (do ((i 0 (1+ i)))
      ((= i pl))
    (declare (fixnum i))
    (setf (aref aa i)(aref p i)))	; copy first arg
  (do ((i 0 (1+ i)))
      ((= i ql) aa)
    (declare (fixnum i))
    (decf (aref aa i)(aref q i))))	; subtract second arg

;; defined in case p is not normalized
;; and has trailing zeros


(defun polyadd-into (p aa start)  ;; hacked up a bit..
  ;; p is shorter than answer aa 
  ;; p is unchanged.  aa is changed to aa+ p*x^start
  (let ((m 0))
  (declare (fixnum start) (type (array integer (*)) p aa) (integer m))
  (do ((i (+ (length p) start -1) (1- i)) )
      ((< i start) aa)
    (declare (fixnum i))
    (unless (= 0 (setf m (aref p (- i start))))
      (incf (aref aa i) m)))))

(defun polysub-into (p aa start)  ;; hacked up a bit..
  ;; p is shorter than answer aa 
  ;; p is unchanged.  aa is changed to aa+ p*x^start
  (let ((m 0))
  (declare (fixnum start) (type (array integer (*)) p aa) (integer m))
  (do ((i (+ (length p) start -1) (1- i)) )
      ((< i start) aa)
    (declare (fixnum i))
    (unless (= 0 (setf m (aref p (- i start))))
      (decf (aref aa i) m)))))


;; remove trailing zeros.
(defun pnorm (x)			; x is a vector
  (declare (optimize (speed 3)(safety 0)(debug 0)))
      (let ((x x) pos)
	(declare (simple-vector x) (fixnum pos)
		 (inline position-if-not coefzerofunp delete-if))
	(setq pos (position-if-not #'zerop x :from-end t))
	(cond
	 ((null pos) #(0))  ; change if we want #() to be zero!
	 ((= pos (1- (length x))) x)	; already normal
	 (t (subseq x 0 (1+ pos) )))))


;; benchmarks
#|
(setf ones (make-array 1000 :initial-element 1)) ;1,000
(setf bigs(make-array 1000 :initial-element (expt 2 40))); 1,000
(setf moreones(make-array 10000 :initial-element 1)) ;10,000
(setf short #(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30))

(time (prog nil (polymult ones ones))) ;;62ms
(time (prog nil (polymultk ones ones)));; 16ms
(time (prog nil (polymult bigs bigs))) ;; 410ms
(time (prog nil (polymultk bigs bigs)));; 109ms
(time (prog nil (polymult moreones moreones))) ;; 5484ms
(time (prog nil (polymultk moreones moreones)));;  656ms


(time (prog nil (polymulttc moreones moreones))) ;; 812ms *toomlim* 32
(time (prog nil (polymulttc moreones moreones))) ;; 641ms *toomlim* 250


(time (prog nil (polymulttc ones ones))) ;;  15ms toomlim =16, rollover to mult
(time (prog nil (polymulttc ones ones))) ;;  32ms toomlim =32, rollover to mult
(time (prog nil (polymulttc ones ones))) ;;  31ms toomlim =32, rollover to multk
(time (prog nil (polymulttc ones ones))) ;;  16ms toomlim =20, rollover to multk

(time (prog nil (polymulttc bigs bigs)));;  203ms toomlim=32

(time (prog nil (ppower short 50))) ;;1140msg
(time (prog nil (ppowerk short 50)));;1159ms

;;declarations affect the times. Consider the bignum vs fixnum times, in
;; a later comment when we recompile polymult.


|#

;;; The Toom-Cook r=2 version, 
(defun evpnum(a b c)
  ;; return a list of evaluating a*x^2+b*x+c  at x= -2,-1,0,1,2
  ;; a,b,c are numbers.
  (let* ((w  (+ a c))
	 (k2 (+ w b))
	 (k  (+ k2 (* 3 a) b)))
    (list (- k (* 4 b)) (- w b) c k2 k)))
;; example (evpnum 3 2 1) --> (9 2 1 6 17)

(defun evp(a b c)
  ;; return a list of evaluating a*x^2+b*x+c  at x= -2,-1,0,1,2
  ;; a,b,c are polynomials. Note that the constant 3 as a polynomial is
  ;; the vector #(3).  To construct it once at read-time, use #.#(3).
  
  (let* ((w  (polyadd a c))
	 (k2 (polyadd w b))
	 (k  (polyadd k2 (polyadd (polymult #.#(3) a) b))) )
    (list (polyadd k (polymult #.#(-4) b))
	  (polysub w b)
	  c k2 k)))

;; example (evp #(3) #(2) #(1)) --> (#(9) #(2) #(1) #(6) #(17))
(defvar *toomlim* 300)
#+ignore
(defun polymulttc(pp qq);; by T-C
  (assert (arrayp pp))
  (assert (arrayp qq))
  (let* ((plim (length pp)) 
	 (qlim (length qq)))
    (declare (fixnum qlim plim)(type (array t) aa pp qq))
    (if (< (min plim qlim) *toomlim*)
	(polymultk pp qq)		; don't use for small polys, use karat
      (let* ((ans (make-array (+ -1 plim qlim) :initial-element 0))
	     (h (round (max plim qlim) 3))
	     (i1 (min (* 2 h) plim))
	     (C1 (subseq pp  i1 plim))
     	     (i0 (min h plim))
	     (A1 (subseq pp 0 i0))
	     (B1 (subseq pp i0 i1))
	     (C2 (subseq qq i1 qlim))
	     (A2 (subseq qq 0 i0))
	     (B2 (subseq qq i0 i1))
	     ;; the following could be done by multiple value returns, in-line code..
	     ;; if it seems to be slower than necessary.
	     (p1 (evp A1 B1 C1)) ;; evaluate at points -2, -1, 0, 1, 2
	     (p2 (evp A2 B2 C2))
	     ;; recursively call toom-cook multiplication
	     (q (mapcar #'polymulttc p1 p2))
	     ;; q is list of q(-2)    q(-1)   q(0)     q(1)     q(2)
	     ;;              (elt 0) (elt 1)  (elt 2) (elt 3)  (elt 4)
	     (t1 (polymult #.#(1/2) (polysub (elt q 3) (elt q 1))))
	     (t3 (polysub (polymult #.#(1/2) (polyadd (elt q 3) (elt q 1)))
			  (elt q 2)))
	     (b  (polymult #.#(1/3) 
			   (polysub (polymult #.#(1/4)
					      (polysub (elt q 4) (elt q 0)))
				    t1)))
	     (a  (polymult #.#(1/3) 
			   (polysub
			    (polymult #.#(1/4)
				      (polysub 
				       (polymult #.#(1/2)
						 (polyadd (elt q 4) (elt q 0))) 
				       (elt q 2)))
				    
				    t3))))
			    
	;; this is right, at least in some examples; 
	;(format t "~%  ans=~s, h= ~s, a = ~s, b= ~s, c= ~s, d= ~s, e= ~s" ans h a b (polysub t3 a) (polysub t1 b) (elt q 2))
	(polyadd-into
	 (elt q 2)
	 (polyadd-into 
	  (polysub t1 b)
	  (polyadd-into (polysub t3 a)
			(polyadd-into b 
				      (polyadd-into a ans  0) 
				      h) 
			(* h 2)) 
	  (* h 3))
	 (* h 4))

	))))
;; the example  (setf *toomlim* 3) (polymult #( 1 2 3 4 )#(1 2 3 4))
;; vs  (polymulttc #( 1 2 3 4 )#(1 2 3 4))
	

#|
qmul(a1,b1,c1,a2,b2,c2):= /*quick polynomial multiply */
 /* returns  product coefs [a,...,e] for ax^4+bx^3+cx^2+dx+e */
  block([a,b,t1,t3,
         v1: evp2(a1,b1,c1),  /*5 values of polynomial 1*/
         v2: evp2(a2,b2,c2)], /*5 values of polynomial 2*/
   /* the 5 mults implicit in the next line could be done by recursive
      application of this kind of multiplication */
	[qm2,qm1,q0,q1,q2]:v1*v2,  /* q(-2),q(-1),q(0),q(1),q(2) */
	t1: (q1-qm1)/2,            /* t1 =  b + d */
	t3: (q1+qm1)/2-q0,         /* t3 =  a + c */
	b: (((q2-qm2)/4)-t1)/3,
	a: ((((q2+qm2)/2-q0)/4)-t3)/3,  /* this is just a1*a2 */
	[a,b,t3-a,t1-b,q0])$
	|#


#| a fixnum array version of polymult

;; polymult-fixnum input and output
(defun polymult(p q) ;; by ordinary poly multiplication.
  (declare (optimize (speed 3)(safety 0)) 
	   (type (array  t(*)) p q))
  (assert (arrayp p))
  (assert (arrayp q))
  (let* ((plim (length p)) ;; degree is plim-1
	 (qlim (length q))
	 (ans(make-array (+ plim qlim -1)
			 :element-type 'fixnum 
			 :initial-element 0))
	 (pc 0)	; coef in p
	 (qq (make-fixnum-array q)) ;; copy q so as to make access faster
	 ;(pp (make-fixnum-array p)) ;; copying p is not so important for 
	)
    (declare(fixnum qlim plim pc)
	    (type (simple-array fixnum (*)) ans qq pp))
    (do ((i 0 (1+ i)))
	((= i plim) ans)
      (declare (fixnum i))
      (unless (= 0 (setf pc (aref p i)))
		 ;;(setf pc (aref pp i)))
      (do ((j 0 (1+ j)))
	  ((= j qlim) nil)
	(declare (fixnum j))
	(incf (aref ans (+ i j))
	      (the fixnum (* (the fixnum pc)(the fixnum(aref qq j))))))))))


;; how does this differ from the other? given the data
;; moreones
;;(setf moreones(make-array 10000 :element-type 'fixnum :initial-element 1)) ;10,000

The original polymult takes (on Windsor, 933 Mhz, ACL 7.0)
61.5 seconds
The fixnum only version takes
4.9 seconds.

In more detail, here are timings.
general version

cl-user(186): (time (polymult moreones moreones))
; cpu time (non-gc) 61,479 msec (00:01:01.479) user, 10 msec system
; cpu time (gc)     0 msec user, 0 msec system
; cpu time (total)  61,479 msec (00:01:01.479) user, 10 msec system
; real time  63,131 msec (00:01:03.131)
; space allocation:
;  128 cons cells, 80,008 other bytes, 0 static bytes
#(1 2 3 4 5 6 7 8 9 10 ...)

fixnum only version
 cl-user(175): (time (polymult moreones moreones))
; cpu time (non-gc) 4,867 msec user, 0 msec system
; cpu time (gc)     0 msec user, 0 msec system
; cpu time (total)  4,867 msec user, 0 msec system
; real time  4,927 msec
; space allocation:
;  8 cons cells, 80,104 other bytes, 0 static bytes
#(1 2 3 4 5 6 7 8 9 10 ...)


|#


(defun polymultbyC(c q) ;; constant X poly, in place.
  (assert (integerp c))
  (assert (arrayp q))
  (let ((qlim (length q)))
    (declare(fixnum qlim))
    (do ((i 0 (1+ i)))
	((= i qlim) q)
      (declare (fixnum i))
	(setf (aref q i)
	  (* c(aref q i))))))
;; test

(defun testmult(a b)(format t "~%classic ~s~%toom    ~s~%lim=~s" (polymult a b)(polymulttc a b) *toomlim*))


;; try again to put the pieces together..
;;; this works.  e.g. try
;; (t2 #(1 2 3 4 5 6 7 8 9 10 11) #( 1 2 3))
;; see later copy
#+ignore
(defun polymulttc(pp qq);; by T-C
  (assert (arrayp pp))
  (assert (arrayp qq))
  (let* ((plim (length pp)) 
	 (qlim (length qq)))
    (declare (fixnum qlim plim)(type (array t) aa pp qq))
    (if (< (min plim qlim) *toomlim*)
	(polymultk pp qq)		; don't use for small polys, use karat
      (let* ((ans (make-array (+ -1 plim qlim) :initial-element 0))
	     (h		;(round (max plim qlim) 3)
	      (truncate (max plim qlim) 3)	      )
	     ;; so there should be 
	     ;;   A    B    C        A+B*x^h+C*x^(2h)
	     ;;   h    h    -maybe-lessthan-h
	     
	     (i1 (min (* 2 h) plim))
	     (C1 (subseq pp  i1 plim))
     	     (i0 (min h plim))
	     (A1 (subseq pp 0 i0))
	     (B1 (subseq pp i0 i1))
	     (j1 (min (* 2 h) qlim))
	     (C2 (subseq qq j1 qlim))
	     (j0 (min h qlim))
	     (A2 (subseq qq 0 j0))
	     (B2 (subseq qq j0 j1))
	     ;; the following could be done by multiple value returns, in-line code..
	     ;; if it seems to be slower than necessary.
	     (p1 (evp A1 B1 C1)) ;; evaluate at points -2, -1, 0, 1, 2
	     (p2 (evp A2 B2 C2))
	     ;; recursively call toom-cook multiplication
	     (q (mapcar #'polymulttc p1 p2))
	     ;; q is list of q(-2)    q(-1)   q(0)     q(1)     q(2)
	     ;;              (elt 0) (elt 1)  (elt 2) (elt 3)  (elt 4)
	     (t1 (polymult #.#(1/2) (polysub (elt q 3) (elt q 1))))
	     (t3 (polysub (polymult #.#(1/2) (polyadd (elt q 3) (elt q 1)))
			  (elt q 2)))
	     (b  (polymult #.#(1/3) 
			   (polysub (polymult #.#(1/4)
					      (polysub (elt q 4) (elt q 0)))
				    t1)))
	     (a  (polymult #.#(1/3) 
			   (polysub
			    (polymult #.#(1/4)
				      (polysub 
				       (polymult #.#(1/2)
						 (polyadd (elt q 4) (elt q 0))) 
				       (elt q 2)))
				    
				    t3))))
			    
	;; this is right, when the two inputs are the same length
	
	;;(format t "~% polyparts ~s ~s ~s ~% ~s ~s ~s" A1 B1 C1 A2 B2 C2)
	;(format t "~%  ans=~s, h= ~s, a = ~s, b= ~s, c= ~s, d= ~s, e= ~s" ans h a b (polysub t3 a) (polysub t1 b) (elt q 2))
	(polyadd-into
	 (elt q 2)
	 (polyadd-into 
	  (polysub t1 b)
	  (polyadd-into (polysub t3 a)
			(polyadd-into b 
				      (polyadd-into a ans  0) 
				      h) 
			(* h 2)) 
	  (* h 3))
	 (* h 4))

	))))


  ;; ok, we believe this one works, so now we
  ;; try to speed it up. A few declarations, maybe

(defun polymulttc(pp qq);; by T-C
  (assert (arrayp pp))
  (assert (arrayp qq))
  (let* ((plim (length pp)) 
	 (qlim (length qq)))
    (declare (fixnum qlim plim *toomlim* )(type (array integer (*)) aa pp qq) )
    (if (< (min plim qlim) *toomlim*)
	(polymultk pp qq)		; don't use for small polys, use karat
      (let* ((ans (make-array (+ -1 plim qlim) :initial-element 0))
	     (h		;(round (max plim qlim) 3)
	      (truncate (max plim qlim) 3)	      )
	     ;; so there should be 
	     ;;   A    B    C        A+B*x^h+C*x^(2h)
	     ;;   h    h    -maybe-lessthan-h
	     
	     (i1 (min (* 2 h) plim))
	     (C1 (subseq pp  i1 plim))
     	     (i0 (min h plim))
	     (A1 (subseq pp 0 i0))
	     (B1 (subseq pp i0 i1))
	     (j1 (min (* 2 h) qlim))
	     (C2 (subseq qq j1 qlim))
	     (j0 (min h qlim))
	     (A2 (subseq qq 0 j0))
	     (B2 (subseq qq j0 j1))
	     ;; the following could be done by multiple value returns, in-line code..
	     ;; if it seems to be slower than necessary.
	     (p1 (evp A1 B1 C1)) ;; evaluate at points -2, -1, 0, 1, 2
	     (p2 (evp A2 B2 C2))
	     ;; recursively call toom-cook multiplication
	     (q (mapcar #'polymulttc p1 p2))
	     ;; q is list of q(-2)    q(-1)   q(0)     q(1)     q(2)
	     ;;              (elt 0) (elt 1)  (elt 2) (elt 3)  (elt 4)
	     (t1 (polymult #.#(1/2) (polysub (elt q 3) (elt q 1))))
	     (t3 (polysub (polymult #.#(1/2) (polyadd (elt q 3) (elt q 1)))
			  (elt q 2)))
	     (b  (polymult #.#(1/3) 
			   (polysub (polymult #.#(1/4)
					      (polysub (elt q 4) (elt q 0)))
				    t1)))
	     (a  (polymult #.#(1/3) 
			   (polysub
			    (polymult #.#(1/4)
				      (polysub 
				       (polymult #.#(1/2)
						 (polyadd (elt q 4) (elt q 0))) 
				       (elt q 2)))
				    
			    t3))))
	(declare (fixnum h i0 i1 j0 j1))
			    
	;; this is now right, even if the two inputs are not the same length
	;;(format t "~% polyparts ~s ~s ~s ~% ~s ~s ~s" A1 B1 C1 A2 B2 C2)
	;;(format t "~%  ans=~s, h= ~s, a = ~s, b= ~s, c= ~s, d= ~s, e= ~s" 
	;;         ans h a b (polysub t3 a) (polysub t1 b) (elt q 2))
	(polyadd-into
	 (elt q 2)
	 (polyadd-into 
	  (polysub t1 b)
	  (polyadd-into (polysub t3 a)
			(polyadd-into b 
				      (polyadd-into a ans  0) 
				      h) 
			(* h 2)) 
	  (* h 3))
	 (* h 4))

	))))



(defun polymulttc2(pp qq);; by T-C
  (assert (arrayp pp))
  (assert (arrayp qq))
  (let* ((plim (length pp)) 
	 (qlim (length qq)))
    (declare (fixnum qlim plim *toomlim* )(type (array integer (*)) aa pp qq) )
    (if (< (min plim qlim) *toomlim*)
	(polymultk2 pp qq)		; don't use for small polys, use karat
      (let* ((ans (make-array (+ -1 plim qlim) :initial-element 0))
	     (h		;(round (max plim qlim) 3)
	      (truncate (max plim qlim) 3))
	     ;; so there should be 
	     ;;   A    B    C        A+B*x^h+C*x^(2h)
	     ;;   h    h    -maybe-lessthan-h
	     
	     (i1 (min (* 2 h) plim))
	     ;;(C1 (subseq pp  i1 plim))
	     (C1 (make-array (- plim i1) :displaced-to pp :displaced-index-offset i1))
     	     (i0 (min h plim))
	    ;; (A1 (subseq pp 0 i0))
	     (A1 (make-array  i0 :displaced-to pp :displaced-index-offset 0))
	     ;; (B1 (subseq pp i0 i1))
	     (B1 (make-array  (- i1 i0) :displaced-to pp :displaced-index-offset i0))
	     (j1 (min (* 2 h) qlim))
	     ;;(C2 (subseq qq j1 qlim))
	     (C2 (make-array  (- qlim j1) :displaced-to qq :displaced-index-offset j1))
	     (j0 (min h qlim))
	  ;;   (A2 (subseq qq 0 j0))
	     (A2 (make-array  j0 :displaced-to qq :displaced-index-offset 0))
	     ;;  (B2 (subseq qq j0 j1))
	     (B2 (make-array  (- j1 j0) :displaced-to qq :displaced-index-offset j0))
	     ;; the following could be done by multiple value returns, in-line code..
	     ;; if it seems to be slower than necessary.
	     (p1 (evp A1 B1 C1)) ;; evaluate at points -2, -1, 0, 1, 2
	     (p2 (evp A2 B2 C2))
	     ;; recursively call toom-cook multiplication
	     (q (mapcar #'polymulttc2 p1 p2))
	     ;; q is list of q(-2)    q(-1)   q(0)     q(1)     q(2)
	     ;;              (elt 0) (elt 1)  (elt 2) (elt 3)  (elt 4)
	     (t1 (polymult #.#(1/2) (polysub (elt q 3) (elt q 1))))
	     (t3 (polysub (polymult #.#(1/2) (polyadd (elt q 3) (elt q 1)))
			  (elt q 2)))
	     (b  (polymult #.#(1/3) 
			   (polysub (polymult #.#(1/4)
					      (polysub (elt q 4) (elt q 0)))
				    t1)))
	     (a  (polymult #.#(1/3) 
			   (polysub
			    (polymult #.#(1/4)
				      (polysub 
				       (polymult #.#(1/2)
						 (polyadd (elt q 4) (elt q 0))) 
				       (elt q 2)))
				    
			    t3))))
	(declare (type (array integer (*)) ans)
		 (fixnum h i0 i1 j0 j1))
			    
	;; this is now right, even if the two inputs are not the same length
	;;(format t "~% polyparts ~s ~s ~s ~% ~s ~s ~s" A1 B1 C1 A2 B2 C2)
	;;(format t "~%  ans=~s, h= ~s, a = ~s, b= ~s, c= ~s, d= ~s, e= ~s" 
	;;         ans h a b (polysub t3 a) (polysub t1 b) (elt q 2))
	(polyadd-into
	 (elt q 2)
	 (polyadd-into 
	  (polysub t1 b)
	  (polyadd-into (polysub t3 a)
			(polyadd-into b 
				      (polyadd-into a ans  0) 
				      h) 
			(* h 2)) 
	  (* h 3))
	 (* h 4))

	))))


#| test results on windsome, 933 mhz, acl6.2
*toomlim* is 300
cl-user(65): (time (prog nil (polymult moreones moreones))) 

; cpu time (non-gc) 50,032 msec user, 10 msec system
; cpu time (gc)     0 msec user, 0 msec system
; cpu time (total)  50,032 msec user, 10 msec system
; real time  51,054 msec
; space allocation:
;  10 cons cells, 81,080 other bytes, 0 static bytes
nil
cl-user(66): (time (prog nil (polymultk moreones moreones)));;  656ms

; cpu time (non-gc) 4,246 msec user, 0 msec system
; cpu time (gc)     290 msec user, 0 msec system
; cpu time (total)  4,536 msec user, 0 msec system
; real time  4,687 msec
; space allocation:
;  88,582 cons cells, 30,910,824 other bytes, 0 static bytes
nil
cl-user(67): (time (prog nil (polymulttc moreones moreones)))
; cpu time (non-gc) 2,905 msec user, 0 msec system
; cpu time (gc)     170 msec user, 0 msec system
; cpu time (total)  3,075 msec user, 0 msec system
; real time  3,134 msec
; space allocation:
;  63,907 cons cells, 24,551,488 other bytes, 0 static bytes
nil

with *toomlim* 300, *karatlim* 12, we get
total time 2.9msec

um,, here's the FFT , just using double precision,
fft copied from numerical recipes in lisp (broughan  nr12.lisp)

cl-user(120):  (time (polymultfft moreones moreones))
; cpu time (non-gc) 440 msec user, 0 msec system
; cpu time (gc)     100 msec user, 0 msec system
; cpu time (total)  540 msec user, 0 msec system
; real time  560 msec
; space allocation:
;  3 cons cells, 8,781,296 other bytes, 0 static bytes
#(1 2 3 4 5 6 7 8 9 10 ...)

If we eliminate declarations, doing "closed" arithmetic as would
be necessary if there were potential bignums in the arrays, the
cost looks like this..
[1] cl-user(68):  (time (polymultfftx moreones moreones))
; cpu time (non-gc) 4,185 msec user, 20 msec system
; cpu time (gc)     7,612 msec user, 0 msec system
; cpu time (total)  11,797 msec user, 20 msec system
; real time  12,738 msec
; space allocation:
;  28 cons cells, 262,538,288 other bytes, 0 static bytes
#(1 2 3 4 5 6 7 8 9 10 ...)

and if we change the constants in the four1 program from double
to single, it looks like this..

[1] cl-user(67):  (time (polymultfftxs moreones moreones))
; cpu time (non-gc) 3,652 msec user, 10 msec system
; cpu time (gc)     4,770 msec user, 81 msec system
; cpu time (total)  8,422 msec user, 91 msec system
; real time  9,714 msec
; space allocation:
;  1,052 cons cells, 158,022,176 other bytes, 0 static bytes
#(1 2 3 4 5 6 7 8 9 10 ...)



cl-user(121): 


cl-user(68): |#


;;um, here's the n^2 version, using just doubles in declared arrays

(defparameter dmoreones (make-array 10000 :element-type 'double-float
			    :initial-element 1.0d0))
(defparameter dsome (make-array 10 :element-type 'double-float
			    :initial-element 1.0d0))


;; try to be faster..
(defun polymultd(p q);; by ordinary poly multiplication. double precision
  (assert (arrayp p))
  (assert (arrayp q))
  (let* ((plim (length p));; degree is plim-1
	 (qlim (length q))
	 (p_i 0.0d0)
	 (ind 0)
	 (ans(make-array (+ plim qlim -1) 
			 :element-type 'double-float 
			 :initial-element 0.0d0)))
    (declare(optimize (speed 3)(safety 0))
	    (fixnum qlim plim ind)
	    (double-float p_i)
	    (type (simple-array double-float (*)) p q ans))
    (do ((i 0 (1+ i)))
	((= i plim) ans)
      (declare (fixnum i))
      (setf p_i (aref p i))
      (do ((j 0 (1+ j)))
	  ((= j qlim) nil)
	(declare (fixnum j) )
	(setf ind (+ i j))
	(setf (aref ans ind)
	      (the double-float 
		(+ (the double-float (aref ans ind))
		   (the double-float 
		     (*  (the double-float p_i)
			 (the double-float (aref q j)))))))))))


;; yet another version, based on qmul4
#|
/* still 6 mults */

qmul4(a1,b1,c1,a2,b2,c2):=   /* take 6 coefficients as input */
 /* two quadratics, each a*x^2+b*x+c */
block([],t1:a1*a2,
       t2:b1*b2,
       t3:c1*c2,
       t4:ratexpand((a1+b1)*(a2+b2)),
       t5:ratexpand((a1+c1)*(a2+c2)),
       t6:ratexpand((b1+c1)*(b2+c2)),  /* leave t1 -- t6 as global vars */
/* returns the coefficients of the product */
[t1, (t4-t1)-t2, 
(t5-t3)+(t2-t1),
 (t6-t2)-t3, t3])$
|#
(defvar *q4lim* 32)

;; there is a bug here.  IT works for #(1 2 3) #(1 2 3) but not moreones.
(defun polymultq4(pp qq);;
  (assert (arrayp pp))
  (assert (arrayp qq))
  (let* ((plim (length pp)) 
	 (qlim (length qq)))
    (declare (fixnum qlim plim)(type (array t) aa pp qq))
    (if (< (min plim qlim) *q4lim*)
	(polymultk pp qq)		; don't use for small polys, use class?
      (let* ((ans (make-array (+ -1 plim qlim) :initial-element 0))
	     (h		;(round (max plim qlim) 3)
	      (truncate (max plim qlim) 3)	      )
	     ;; so there should be 
	     ;;   C    B    A        C+B*x^h+A*x^(2h)
	     ;;   h    h    -maybe-lessthan-h
	     
	     (i1 (min (* 2 h) plim))
	     (A1 (subseq pp  i1 plim))
     	     (i0 (min h plim))
	     (C1 (subseq pp 0 i0))
	     (B1 (subseq pp i0 i1))
	     (j1 (min (* 2 h) qlim))
	     (A2 (subseq qq j1 qlim))
	     (j0 (min h qlim))
	     (C2 (subseq qq 0 j0))
	     (B2 (subseq qq j0 j1))
	     (t1 (polymultq4 A1 A2))
	     (t2 (polymultq4 B1 B2))
	     (t3 (polymultq4 C1 C2))
	     (t4 (polymultq4 (polyadd A1 B1)(polyadd A2 B2)))
	     (t5 (polymultq4 (polyadd A1 C1)(polyadd A2 C2)))
	     (t6 (polymultq4 (polyadd B1 C1)(polyadd B2 C2))))
	(declare (fixnum h i0 i1 j0 j1))	
	;;(format t "~s ~s ~s ~%~s ~s ~s~%ts ~s ~s ~s" A1 B1 C1 A2 B2 C2 t1 t2 t3)


	(polyadd-into t1  ;t1
		      (polyadd-into
		       (polysub (polysub t4 t1)t2)  ;t4-t1-t2
		       (polyadd-into 
			(polyadd (polysub t5 t3) (polysub t2 t1))
			(polyadd-into (polysub (polysub t6 t2) t3)
				      (polyadd-into  t3  ans
						     0) 
				      h) 
			(* h 2)) 
		       (* h 3))
		      (* h 4))	))))