(eval-when (compile load) (declaim (optimize (speed 3)(safety 0))))
(load "fftwdsimple.fasl")
;; polynomial multiplication by FFT:  pack 2 input arrays into one array
;; for a complex fft.

;;; BROKEN

(defvar *fftbufsize* 8)
(defvar *fftbuf* (make-array *fftbufsize* :element-type 'double-float 
			     :allocation :static-reclaimable 
			     ))
(defvar *fftbuf2*(make-array *fftbufsize* :element-type 'double-float 
			     :allocation :static-reclaimable 
			     ))

(defun fftpoly2(poly1 poly2 len)
  ;; poly1, poly2 inputs are pairs of arrays:
  ;; of exponents in increasing order,
  ;; and of coefs corresponding to those exponents.
  ;; len is desired length of transform, which is at least as large as degree of poly1,2
  ;; but may be rounded up to next power of 2, or some other convenient FFT length
  ;; This program returns a transformed sequence suitable to
  ;; send into untangle to get the
  ;; fourier transforms of poly1 and poly2l of length len
  (declare (fixnum len))
  (let*((exps1 (car poly1))
	(coefs1 (cdr poly1))
	(exps2 (car poly2))
	(coefs2 (cdr poly2))
	(e 0)
	;(c 0.0d0)
	(r *fftbuf*) ;; this is known to be at least 2*len in size big enough 
	;; we could do this in r, for this program..
	;;(ans(make-array (+ len len) :element-type 'double-float :displaced-to r))
	;;(ans *fftbuf*)
	(plan (fftw_plan len r r -1 64))
	)
    (declare (fixnum len e ); (double-float c)
	     (type (array double-float (*)) r) 
	     (optimize (speed 3)(safety 0)))
    ;; array is twice the length of the (real) input to make room for real & imag parts
     ;; copy real/imag parts of input into adjacent array locations and pre-normalize.
    ;; Fill with zeros first.
 
    (loop for i fixnum from 0 to (+ len len -1) do (setf (aref r i) 0.0d0))
    ;; next, copy real numbers from poly1
    (loop for i fixnum from 0 to (1- (length exps1)) do
	  (setf e (aref exps1 i))  ;; assume coefs are already double-floats ...
	  (setf (aref r (+ e e))  (float (aref coefs1 i) 1.0d0)
	    ))
    (loop for i fixnum from 0 to (1- (length exps2)) do
	  (setf e (aref exps2 i))
	  (setf (aref r (+ 1 e e)) (float (aref coefs2 i) 1.0d0)
		))
    ;;  (if *debug*   (format t "~%len=~s" len))
    ;;(if *debug*     (format t "~%r=~s" (loop for i from 0 to len collect (aref r i))))
    ;; we can do the fft plan here since estimate only will be cheap and not clobber r?
    (fftw_execute_dft plan r r)	      ;; do the FFT
    (fftw_destroy_plan plan)
    r ))

;; faster yet?

(defvar *debug* t)

(defun untangleprod2(f NN )  
  ;;; given an array f, of real/imag/real/imag doubles, untangle to a pair of
  ;;; fourier transforms, and multiply them pointwise
  
  ;;; according to http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM
  ;;; with these interleaved data r,s.  fft(k,z) is the kth element of 
  ;;; the array of fft coefs. (but we use 2 spots for real/ imag
  
  ;;;  r= fft(k,r) =   (fft(k,z)+conj(fft(N-k,z))/2
  ;;;  s= fft(k,s) =-i*(fft(k,z)-conj(fft(N-k,z))/2
  
  ;;; they leave out what to do when k=0, since fft is defined from 0 to N-1.
  ;;; treat fft(N,z) == fft(0,z).
  
  ;;; so ans[q]=r[q]*s[q] 
  ;;; returned as real/imag real/image etc.

  (declare (type (array double-float (*)) f)
	   (fixnum NN)
	   (optimize (speed 3)(safety 0)))
  ;; (format t "~% NN=~s" NN) 
  ;;  (format t "~%f=~s" f)
;;  (if *debug* (format t "~%unt=~s" (subseq f 0 (* 2 NN))))

  (let*( 
	;;(ans2 *fftbuf*) ;; re-use buffer??
	;;(ans2 (make-array  (+ NN NN) :element-type 'double-float :initial-element 0.0d0))
	(ans2 *fftbuf2*)
	(ar 0.0d0)(ai 0.0d0)(br 0.0d0)(bi 0.0d0)(idx1 0)(idx2 0) 
	(sr 0.0d0)(si 0.0d0)(rr 0.0d0)(ri 0.0d0))
    (declare (type (array double-float (*)) ans2) (fixnum idx1 idx2)
	     (double-float ar ai br bi sr si rr ri))
    (setf (aref ans2 0) (* (aref f 0)(aref f 1)))
    (setf (aref ans2 1) 0.0d0)

    (loop for k fixnum from 1 to  (1- NN) do 
	  (setf idx1 (+ k k))
	  (setf idx2 (* (- NN k) 2))
	  (setf ar (aref f idx1)
		ai (aref f (1+ idx1)))
	  (setf br (aref f idx2)
		bi (aref f (+ idx2 1)))
	    
	  (setf rr  (+ ar br)
		ri  (- ai bi))
	  (setf sr (+ ai bi)
		si (- br ar))
	 ; (format t "r= ~s ~s, s= ~s ~s" rr ri sr si)
	  (setf (aref ans2 idx1)     (* 0.25d0 (- (* rr sr) (* ri si))))
	  (setf (aref ans2 (1+ idx1))(* 0.25d0 (+ (* rr si)(* ri sr)))))

    ;; this all accomplishes complex multiplication of relevant parts.
    ;; pieces unraveled for speed, instead of consing up complex numbers.
;;      (if *debug* (format t "~%ans2=~s" (subseq ans2 0 (* 2 NN))))
    ans2))

(defun mulfft3(r s)  ;; multiply polynomials r,s using floating FFT
  ;;compute the size Z of the answer. Z is a power of 2.
  ;; compute complex array r, increased to size S
  ;; compute complex array r, increased to size S
  ;; compute two FFTs
  ;; multiply pointwise
  ;; compute inverse FFT  * 1/n
  ;; convert back to array and return answer, coefficients rounded to integers.
  ;; The answer will be only approximate if the coefficients are too large or
  ;; too corrupted by round-off.
  (let* ((lr (length (car r)))
	 (ls (length (car s)))
	 ;; compute deg of r + degree of s +1
	 (lans (+ (aref (car r) (1- lr)) (aref (car s) (1- ls)) 1))
	 (z (ash 1 (ceiling (log lans 2))))) ; round up to power of 2
	 ;;; check the size of fftbuf here.. allocate more if necessary
    (if (< *fftbufsize* (+ z z)) ;; enough for real and imag part 
	(setf *fftbuf* (make-array (setf *fftbufsize* (+ z z) )  ;; real and imag parts
				   :element-type 'double-float
				   :allocation :static-reclaimable
				   )
	      *fftbuf2*  ;; need another one, or could overlap ops in *fftbuf* with some fancier progs.
	      (make-array  *fftbufsize*   ;; real and imag parts
				   :element-type 'double-float
				   :allocation :static-reclaimable
				   )))
    ;; this next line does the whole job.
;;    (format t "~%z=~s" z)
    (fftiR3(untangleprod2 (fftpoly2 r s z) z) z)))

(defun fftiR3(input len) ;; this is the inverse FFT, retaining only REAL part of answer
  ;; input is array of real, imag, real, imag, ...
  ;; returns array of REAL parts of answer, double-float.

  ;; computes a plan, if there is not one.
  (declare   (type (array double-float (*)) input))
  (let*(;(len (ash (length input) -1))
	(r input)
	(anse (make-array  len :element-type 'fixnum :fill-pointer 0))
	(ansc (make-array  len  :fill-pointer 0))
	(q  len)
	(plan (fftw_plan len r r 1 64))
	(k 0))

    (declare (fixnum len k q)  
	     (type (array double-float (*)) ansc)
	     (type (array fixnum (*)) anse)
	     (optimize (speed 3)(safety 0)))
    (fftw_execute_dft plan  r r)		  ;; do the FFT
;    (if *debug* (print "!"))
    (fftw_destroy_plan plan)
    
    (loop for i fixnum from 0 to (1- (* 2 len)) by 2 do ;; copy real parts into answer
	    (setf k (round (aref r i) q))
	  ;(setf k (aref r i))
	;;  (format t "~% r[i]=~s k=~s " (aref r i) k)
	  (cond ((= k 0) nil)
	       (t  (vector-push (ash i -1) anse)
		   (vector-push k ansc))))
    (cons anse ansc)))


;;maxima version of pieces of this..
#|
;; normalization is 1, 1/n

(slowFFT(x):= block([n:length(x)],
               map(lambda([k], sum(x[j]*exp((-2*%i*%pi*(j-1)*k)/n),j,1,n)),
	       makelist(i,i,0,n-1))),		       

slowFFTi(x):= block([n:length(x)],
               map(lambda([k], 1/n*sum(x[j]*exp((2*%i*%pi*(j-1)*k)/n),j,1,n)),
	       makelist(i,i,0,n-1))),		       

untangle(v):=block([n:length(v)], local(a,b,r),
 r[i]:=v[i+1],
 a[0]:realpart(r[0]),
 b[0]:imagpart(r[0]),
 a[i]:=1/2*(r[i]+conjugate(r[n-i])),
 b[i]:= -%i/2*(r[i]-conjugate(r[n-i])),

  [makelist(a[i],i,0,n-1),
   makelist(b[i],i,0,n-1)]))
 |#

 #| 
(defvar xv (cons (coerce  (loop for i from 0 to 1999 collect i) 'vector)
		 (coerce  (loop for i from 0 to 1999 collect 1) 'vector)))
(defvar xd (cons (coerce  (loop for i from 0 to 1999 collect i) 'vector)
		 (coerce  (loop for i from 0 to 1999 collect 1.0d0) 'vector)))

(defvar xs (cons (coerce  (loop for i from 0 to 511 collect i) 'vector)
		 (coerce  (loop for i from 0 to 511 collect 1.0d0) 'vector)))

(defvar xss (cons (coerce  (loop for i from 0 to 511 by 2 collect i) 'vector) ;sparse
		  (coerce  (loop for i from 0 to 511  by 2 collect 1.0d0) 'vector)))

(defvar xt2 (cons (coerce  (loop for i from 0 to 511 by 8 collect i) 'vector) ;sparse
		  (coerce  (loop for i from 0 to 511  by 8 collect 1.0d0) 'vector)))

(defvar xt16 (cons (coerce  (loop for i from 0 to 511 by 16 collect i) 'vector) ;sparse
		   (coerce  (loop for i from 0 to 511  by 16 collect 1.0d0) 'vector)))

(defvar xt100 (cons (coerce  (loop for i from 0 to 99 collect i) '(array fixnum (*)))
		    (coerce  (loop for i from 0 to 99   collect 1.0d0) '(array double-float (*)))))
|#


;; for comparison purposes we might need a pure fft multiplication routine that
;; assumes as input two double-float dense arrays; exponents implicit.  Here it is




(defun mulfftd(p q)
    (declare(optimize (speed 3)(safety 0))
	    (type (array double-float (*)) p q))
  (assert (arrayp p))
  (assert (arrayp q))
  (let* ((plim (length p));; degree is plim-1
	 (qlim (length q))
	 (lans (+ plim qlim -1))
	 (z (ash 1 (ceiling (log lans 2)))) ;; don't round up to power of 2.
	 ;; fftw can handle any length, and usually saves time.
	 ;;(z lans)
	 (zflo (/ 1.0d0 (float z 1.0d0)))
	 (ppqq (make-array (+ z z) :element-type 'double-float :initial-element 0.0d0))
;;	 (ppqq *fftbuf*)
;;	 (ans (make-array lans :element-type 'double-float)) ;; could use ppqq again..
	 )				; round up to power of 2
    (declare  (fixnum qlim plim) (double-float zflo) )
    (loop for i fixnum from 0 below plim do (setf (aref ppqq (+ i i))
					; (aref p i)
					      (* zflo (aref p i))
					      )) ;; normalize?
    (loop for i fixnum from 0 below qlim do (setf (aref ppqq (+ 1 i i))(aref q i)))
    (fftw_execute_dft (fftw_plan z ppqq  ppqq -1 64) ppqq ppqq)
    (setf ppqq (untangleprod2 ppqq z))
    (fftw_execute_dft (fftw_plan z ppqq  ppqq 1 64) ppqq ppqq)
    
    (loop for i fixnum from 0 below lans do (setf (aref ppqq i)(aref ppqq (+ i i))))
    (subseq ppqq 0 lans)
    ;;ppqq
	 ))




(defvar x0 

 (cons #(2 6 8 10 12 14 16 17 21 23 27 31 34 38 41 45 47 51 52 53 56 60 64 66
   67 70 72 74 78 79)
  #(930 19 942 816 510 1005 397 1007 279 503 969 449 173 268 820 949
     136 60 251 774 712 289 875 722 937 786 1015 526 157 471)))