;;; attempts by RJF to make OCT work fast in Allegro, based on 
;;; translating RTOY's code to macros, and converting code from RJF's QD to OCT.
;;; see comments in file octi.lisp too
;;;  certainly until I include all of RTOY's stuff for sin/cos/.
;;;  Still, my qd stuff has quadrature, fft, polynomial eval.

;;; 10/18/07 RJF

(defpackage :oct
  (:use :ga  :octi :cl)
    (:shadowing-import-from  
   :ga    
   "+" "-" "/" "*" "expt"		;... n-ary arith
   "=" "/=" ">" "<" "<=" ">="		;... n-ary comparisons
   "1-" "1+" "abs" "incf" "decf"
   "min" "max"
   "ftruncate" "ffloor" "fround" "fceiling" 
   sin cos tan 
   asin acos 
   atan log   ;; these are trickier, 1 or 2 args.
   exp
   sinh cosh tanh 
   asinh acosh atanh 
   sqrt 
   tocl 
   numerator denominator
   expt 
   rational  ;; convert to rational, exactly
   )
    (:export into)
    )

(in-package :oct)   ;;; NOT OCTI


(eval-when (:execute :load-toplevel :compile-toplevel :execute)
  
(defun into(x &optional (targ 0 targp)) ;if a target supplied, store into it
    (if targp (progn (octi::into x (oct-real targ))targ)
      (make-oct :real (octi::into x))))



(defstruct oct (real  #+allegro
		      (make-array 4 :element-type 'double-float 
			       :initial-element 0.0d0 
			       ;; next line is Allegro-specific
			       :allocation :lispstatic-reclaimable
			       )
		      #-allegro
		      (make-array 4 :element-type 'double-float 
			       :initial-element 0.0d0)

		      :type (simple-array double-float (4)))))

(defmethod make-load-form ((a oct)&optional environment)
  (make-load-form-saving-slots a :environment environment)

;;(defmethod make-oct-d ((a array))  ;; make an object from the array
;;  (make-instance 'oct :real a))
)
(defmacro copy_mac(a);;make a fresh copy of oct a.
  `(make-oct :real
   (make-array 4 :element-type 
			    'double-float 
			    :initial-contents (oct-real ,a)
			    :allocation :lispstatic-reclaimable)))

(defun copy_into(op1 op2) ;fill op2's memory with op1's value.
  (octi::copy-octi-into-octi (oct-real op1)(oct-real op2)) op2)

(defmacro defarithmetic (op pgm)
    (let ((two-arg
	   (intern (concatenate 'string "two-arg-" (symbol-name op))
		   :ga))
	  (oo-entry ;; two oct args
	   (intern (concatenate 'string (symbol-name pgm) "-oct-t")
		   :octi))
	  ;;(od-entry ;; oct and double args
	  ;; (intern (concatenate 'string (symbol-name pgm) "-oct-d-t") :octi))
	  )
	
 ;;           (format t "~% defining ~s" two-arg)
      `(progn
	 
	 ;; new defmethods for oct. ..
	 (defmethod ,two-arg ((arg1 oct) (arg2 oct))
	   (let* ((r (oct::make-oct))
		  (in (oct::oct-real r))
		  (a1 (oct::oct-real arg1))
		  (a2 (oct::oct-real arg2)))
	     (declare (optimize speed)
		      (type(simple-array double-float (4)) in a1 a2))
	     (,oo-entry a1 a2 in) r))
       
	 (defmethod ,two-arg((arg1 real) (arg2 oct))
	  #+ignore (,two-arg (into arg1) arg2)
	   (let* ((r (oct::make-oct :real (octi::into arg1)))
		  (in (oct::oct-real r))
		  (a2 (oct::oct-real arg2)))
	     (declare (optimize speed)
		      (type(simple-array double-float (4)) in a1 a2))
	     (,oo-entry in a2 in) r)
	   )
       
	 (defmethod ,two-arg ((arg1 oct) (arg2 real))
	  #+ignore    (,two-arg arg1 (into arg2))

		   (let* ((r (oct::make-oct :real (octi::into arg2)))
		  (in (oct::oct-real r))
		  (a1 (oct::oct-real arg1) ))
	     (declare (optimize speed)
		      (type(simple-array double-float (4)) in a1 a2))
	   
	     (,oo-entry a1 in in) r))
	 (setf (get ',op 'argnum) 2) ;used by with-temps, dsetv
	 (setf (get ',op 'oct-program) ',oo-entry) ;used by with-temps, dsetv
	 (setf (get ',two-arg 'oct-program) ',oo-entry) ;used after macroexpand-all
	 (setf (get ',two-arg 'argnum) 2)
	 
	 )))


(defarithmetic + add)
(defarithmetic * mul)
(defarithmetic - sub)
(defarithmetic / div)

;; need to do other elementary functions etc as well.
;; see lisp/generic/qd and  lisp/oct/qd-fun..

;; this special case doesn't fit into defarithmetic macro.
(defmethod ga::two-arg-expt ((base oct)(n integer))
  (let ((ans (make-oct)))
    (octi::pow-oct-i-t (oct-real base) n (oct-real ans))
    ans))


       
(defmethod print-object ((a oct) stream)(format stream "~a" 
						(octi::oct2string (oct::oct-real a))))

(defmethod octi::lisp2oct((s string)(ans array))(string2oct s))

(defmethod octi::lisp2oct((a oct) (ans array)) (octi::copy-octi-into-octi (oct-real a) ans ))

(defun oct-reader(stream subchar arg)
  (declare (ignore subchar arg))
  (let ((s (read stream)))

    (string2oct (format nil "~s" s))))

(defun string2oct(s);; read integerQinteger like 10Q2   = 100 = 0.1Q3  or  -3.1q-1
  ;; examples of acceptable numbers [put " " around them]
  ;; .1q0 ; 1Q0; 1.2q3;  -1.23q-4;  - .23 q -4;  2.3q+4; +4.5q6; q0; --3Q1 same as 3q1
  ;; 3 ; 3.4; 123.45q+100; -.q0 ; .q0 ;
  ;; examples of not acceptable numbers:
  ;;    q ; .q;   123.45q+100  [=NaN(qd)]
  (let ((p0 0)(sign 1))
    (setf s (delete #\  s));; remove leading or other spaces from string.
    (if (char= #\- (aref s 0))(setf p0 1 sign -1));; set sign if negative
    (multiple-value-bind (frac pos);; read fraction to left of .
	(parse-integer s :start p0 :radix 10 :junk-allowed t)
  ;;    (format t "~% frac= ~s pos=~s" frac pos)
      (if (null frac)(setf frac 0));; empty fraction is zero
      (if (= pos (length s)) (* sign (into frac))
	(case (aref s pos);; look at next char
	  ((#\Q #\q);; 10Q2
	   (multiple-value-bind (expon pos2)
	       (parse-integer s :start (1+ pos);skip the Q
			      :radix 10)
	;;     (format t "~% sign=~s frac=~s expon=~s" sign frac expon)
	     (* sign (into frac) (expt 10 expon))))
	  (#\.;; 
	   (multiple-value-bind (frac2 pos2)
	       (parse-integer s :start (1+ pos);skip the "."
			      :radix 10 :junk-allowed t )
	     (if (null frac2)(setf frac2 0))
	     (setf frac
		   (+ (into frac)
		      (* (if (< frac 0) -1 1) frac2 (cl::expt 10 (cl::1+(cl::- pos pos2))))))
	      ;; (format t "~% string pos ~s frac= ~s" pos2 frac)
	     (if (cl::= pos2 (length s))(* sign (into frac))
	       (case (aref s pos2)
		 ((#\Q #\q);; 10Q2
		  (multiple-value-bind (expon pos3)
		      (parse-integer s :start (1+ pos2);skip the Q
				     :radix 10 )
		    (* sign  frac (cl::expt 10 expon))
		    ))
		 (otherwise 
		  (format t "next char is ~a -- Not an oct spec: ~s"  
			  (aref s pos2) s)
		  (into (or frac 0)))))))
	  )))))



;; reading OCT numbers can be done this way...
(set-dispatch-macro-character #\# #\q #'oct::oct-reader) 

(defun time-mul (n)
  (declare (fixnum n)
	   (optimize (speed 3) (safety 1) (debug 0)))
  (let* ((h (into 1/3)))
    (print h)
    (time (dotimes (k n)
	    (declare (fixnum k))
	    (* h h)))))

(defun time-poly (n)
  (declare (fixnum n)
	   (optimize (speed 3) (safety 1) (debug 0)))
  (let* ((h (list (into 5)(into 2)(into 1)))
	 (arg (into 100)))
    (print h)
    (time (dotimes (k n)
	    (declare (fixnum k))
	    (polyeval h arg)))))
(defun time-polyd (n)
  (declare (fixnum n)
	   (optimize (speed 3) (safety 1) (debug 0)))
  (let* ((h (list 5d0 2d0 1d0))
	 (arg 100d0))
    (print h)
    (time (dotimes (k n)
	    (declare (fixnum k))
	    (polyevald h arg)))))



;;Compiler for  oct data type
;; RJF
(in-package :oct)

;;; We want to make better use of the state-based programs like
;;; mul-oct-t Assuming octs.. for a, b, and c: (dsetv a (+ b c))
;;; destroys the value in a.  Compare this to (setf a (+ b c)) which
;;; creates a new value and points a to it.

;; dsetv,  data driven
(defmacro dsetv (targ ex)
  ;; try  (dsetv a (+ b c)) 
  ;; should be faster than (setf a (+ b c)). maybe 2X.
  ;; All the logic below is done during macro-expansion,
  ;; which means it is usually done at compile time. Run time
  ;; is therefore not penalized.  If you use dsetv from an interpreted
  ;; program it will be slow, however, because it will do the macro
  ;; expansion followed by the execution, each time it is used.
  (setf ex (macroexpand ex))  
  (cond 
   ((atom ex) `(into ,ex ,targ))
   ((eq (car ex) 'into) `(into ,@(cdr ex)  ,targ))
   ((eq (car ex) 'setq) 
    (let ((gg (gensym))) ;; need to protect against capturing z in (setq z ..))
    `(let ((,gg  ,(with-temps (caddr ex))))
       (copy_into  (oct-real ,gg) (oct-real ,(cadr ex)))
       ,gg)))
   (t 
    (let* ((op (car ex))
	   (args (cdr ex))
	   (the-op (get op 'oct-program))
	   (argnum (get op 'argnum)))
      (cond 	       
       ((not the-op);; not a previously listed op
	`
	   (let* ((lval ,targ)
		  (a1 (oct-real  (,op ,@ args)))
		  (tt (oct-real lval)))
	     (declare (optimize speed)
		      (type (simple-array double-float (4)) a1 tt))
	     (copy_into a1 tt)
	     lval))
       ((not (eql argnum (length args))) 
	(error "dsetv was given operator ~s which expects ~s args, but was given ~s --  ~s" 
	       op argnum (length args) args))
       (t
	(case argnum
	  (1;; one argument.
	   `(let ((a1 (oct-real ,(macroexpand `(with-temps ,(car args)))))
		    (tt (oct-real ,targ)))
		(declare (optimize speed)(type (simple-array double-float (4)) a1 tt))
		;; could also check other args for being type qd
		;; could also allow for args to be si, ui, dd, etc.
		;; could also check number of args to be appropriate for operation
		(,the-op a1 tt)
	      ,targ))
	  (2
	   `(let ((a1 (oct-real ,(macroexpand `(with-temps ,(car args)))))
		  (a2 (oct-real ,(macroexpand `(with-temps ,(cadr args)))))
		    (tt (oct-real ,targ)))
		(declare (optimize speed)(type (simple-array double-float (4)) a1 a2 tt))
		(,the-op a1 a2 tt)
		,targ
		))
	  (otherwise (error "argnum is wrong for op ~s " op))
	  )))))))

;;;;;;;;;;;;;;;;;;more efficiency hackery follows.;;;;;;;;;;;;;;;;


;; We just allocate a few private "registers" say, for a
;; function, or an inner loop, and re-use them, if we are in a
;; loop. No need to tell anyone else about a few temp locations,
;; especially if they are GC'd when truly inaccessible.  That's what
;; is below.

(defmacro with-temps(expr)
  (let ((*names* nil)
	(*howmany* 0))
    (labels ((genlist(n)(loop for i from 1 to n collect (into i))) ;make a list of fresh qd items
	     (ct1 (r) ;; count temporaries needed
	       (cond ((numberp r) (incf *howmany*))
		     ((not (consp r)) r)
		     (t (incf *howmany*)
			(mapc #'ct1 (cdr r)))))
		
	   (maketemps(r) ;change r=(+ a (* b c)) to  temp storage .
		     (cond ((numberp r) (into r))
			   ((atom r) r)
			   ((get (car r) 'argnum); known operator
			    `(dsetv ,(pop *names*)
				    ,(cons (car r)(mapcar #'maketemps (cdr r)))))
			   ;; just a symbol name? maybe aref? better be the right type, aqd.
			   (t  r))))
      (setf expr (macroexpand expr))
       (ct1 expr)
     ;; (ct1 expr); count the temporaries
    (setf *names* (genlist *howmany*))
    (maketemps expr))))


;;  try (pprint (macroexpand '(with-temps (+ x (* 3 z)))))
;; or  (defun hypot(x y)(copy_mac (with-temps (sqrt (+ (* x x)(* y y))))))
;; need the call to copy to make a  copy of the result before calling hypot again. 

;; set up the environment for with-temps and dsetv here

(eval-when (compile load eval)
  (mapc #'(lambda(h) (setf (get h 'argnum) 2)) '(atan2 log2 setq))
;; for now assume they are given only one arg and if they are given 2 signal an error with dsetv.
  (mapc #'(lambda(h) (setf (get h 'argnum) 1)) '(atan log))
;;  (defun qd_setq (a b)(qd_copy_into b a) b)
 )

;; time trial

(defun time-mul-d (n)
  (declare (fixnum n)
	   (optimize (speed 3) (safety 1) (debug 0)))
  (let* ((h (into 1/3))
	 (ans (into 0)))
    (print h)
    (time (dotimes (k n)
	    (declare (fixnum k))
	    (dsetv ans (* h h))))))


;;;;;;;;;;;::::There's a lot more in generic/qd.lisp
;;;;;;;;;not yet converted to OCT.

(defmacro defcomparison (op)
    (let ((two-arg-ga (intern (concatenate 'string "two-arg-" 
					   (symbol-name op)) :ga))
	  (two-arg-octi (intern (concatenate 'string "two-arg-" 
					(symbol-name op)) :octi))
	  )
    ;;  (format t "~% defining ~s" two-arg)
      `(progn
	 ;; only extra methods not in ga are defined here.
	 ;; qd_comp returns -1 0 1   for < = >
	 (defmethod ,two-arg-ga ((arg1 oct) (arg2 oct))   
	   (,two-arg-octi (oct-real arg1)(oct-real arg2)))

	 (defmethod ,two-arg-ga ((arg1 real) (arg2 oct))
	   (let ((arg1 (into arg1)))
	     (,two-arg-octi (oct-real arg1)(oct-real arg2))))
	 
	 (defmethod ,two-arg-ga ((arg1 oct) (arg2 real))
	   (let ((arg2 (into arg2)))
	     (,two-arg-octi (oct-real arg1)(oct-real arg2))))
	 ',op)))

(defcomparison >)
(defcomparison <)
(defcomparison =)
(defcomparison /=)
(defcomparison >=)
(defcomparison <=)

;; allows (< (into 1)(into 2)(into 3)) 



;;How about the single-arg functions like sin cos log...

(defmacro r (op);;
  (let ((fun-name  (intern op :ga ))
	(octi-name (intern (concatenate 'string op "-oct-t") :octi)))

    `(progn
       (defmethod ,fun-name ((arg oct))
	 (let* ((h (make-oct)) (in (oct-real h)))
	   (declare (optimize speed)
		    (type (simple-array double-float (4)) in))
	   (,octi-name (oct-real arg) in)
	   h))
       
       (setf (get ',fun-name 'argnum) 1)
       (setf (get ',fun-name 'oct-program) ',octi-name)
       ',fun-name
       )))


(r "sqrt") (r "abs")
(r "sin")(r "cos")

;;(r "log")(r "exp") ;; etc
(r "1-")
(r "1+")
(r "exp")


(defmethod ga::one-arg-atan((arg oct))
           (let* ((h (make-oct)) (in (oct-real h)))
           (declare (optimize speed)
            (type (simple-array double-float (4)) in))
           (octi::atan-oct-t (oct-real arg) in)
           h))

(defun ftruncate ( x &optional (y 1))
  (octi::ftruncate-oct (oct-real x) y))
(defun fceiling (x &optional (y 1)) ;; fceiling for oct only
  (octi::fceiling-oct (oct-real x) y))
(defun fround (x &optional (y 1))
  (octi::fround-oct (oct-real x) y))

;; complex version
;; we do not propose to implement complex numbers at the octi level.


(defstruct octz (real (into 0) oct)
	   (imag (into 0)oct))

;; try these out.

(defmethod ga::two-arg-+ ((a octz)(b octz))
  (let ((ans (make-octz)))
    (setf (octz-real ans) (+ (octz-real a)(octz-real b))
	  (octz-imag ans) (+ (octz-imag a)(octz-imag b)))
    ans))

(defmethod ga::two-arg-* ((a octz)(b octz))
  (let ((ans (make-octz)))
    (setf (octz-real ans) (* (octz-real a)(octz-real b))
	  (octz-imag ans) (* (octz-imag a)(octz-imag b)))
    ans))

(defun lisp2octz(a)
  (let ((ans (make-octz)))
    (setf (octz-real ans) (into (realpart a))
	  (octz-imag ans) (into (imagpart a)))
    ans))

;; we could set up a separate package for complex oct..
;; but then the sqrt(neg) issue becomes complicated.

(defmethod ga::exp ((z octz))
  ;; exp(a+bi)= exp(a)*(cos b + i sin b)
  
  (let* ((ans (make-octz))
	 (a (octz-real z))		;real
	 (b (octz-imag z))		;real
	 (ea (exp a))			;real
	 )
    (setf (octz-real ans)   (* ea(cos b))
	  (octz-imag ans)   (* ea (sin b)))
    ans))

;; re-implement ga::sqrt((z oct))  as well as ga::sqrt((z octz)).


	  
    

  
;;;;;; some fun.
;; need abs, < >
#|

;; A BUNCH OF PROGRAMS PLAYING OCT GAMES and ROOTFINDING.

;; Refining a polynomial root via Newton's method.
;; First, deriv computes the derivative of a polynomial, in a list.
;; recall that we can create a polynomial as a list:
;; (setf testpoly (list 5 3 1)) ;; 5*x^2+3*x+1
;; or as an OCT polynomial
;; (setf testpoly (list (into 5) (into 3) (into 1))) ;; 5*x^2+3*x+1
|#

(defun deriv(coefs)
  ;;given coefs of a polynomial. return coefs of derivative.
  ;; constant coeff is last.  Any kind of numbers are OK.
  (let ((ans nil) (i 0))
    (dolist (c (cdr (reverse coefs)) ans) 
      (push (* (incf i) c) ans))  ans))

(defun polyeval (coeflist x) ;coeflist of octs, x is an oct
      (let ((sum (into 0)))
      (dolist (i coeflist sum)
	(setf sum (with-temps (+ i (* x sum)))))))

;;  a fairly general Newton iteration requiring three parameters, and some optional ones.
;;  f  is a function to evaluate f(x),
;;  df is a function to evaluate f'(x).
;;  x, a starting point,
;; optional: threshold for absolute value of f at which to stop,
;; optional: iters, a maximum number of iterations.

;;  oneroot uses whatever arithmetic is appropriate, based on what f and df use.

(defun oneroot (f df x threshold iters &aux fval)
  (dotimes  (i iters (error "rootfinder failed to converge. Residual is ~s after ~s iterations." 
			    fval i))
    (setf fval (funcall f x))
    (if (< (abs fval) threshold) 
	(return (values x fval i))		;return x, residual and iteration count
      (decf x (/ fval (funcall df x))))))


;; if we replace (decf x (/ fval (funcall df x)))
;; by            (dsetv x (with-temps (- x (/ fval (funcall df x))))))))
;; this would save 2 allocated temporaries per iteration.


;; A carefully declared polynomial eval using
;; Lisp double-floats. I don't see how to make this more optimized.
(defun polyevald (dlist x) 
  (let ((sum 0.0d0))
    (declare (double-float sum x) (optimize (speed 3)(debug 0)))
    (dolist (i dlist sum)
      (declare (double-float i))
      (setf sum (the double-float 
		  (cl::+ i
			 (the double-float 
			   (cl::* x sum)))))))) ;; this is about 130 X faster than the oct version.

;; Next, set up a call to oneroot given a list of coefficients representing
;; a polynomial.  polyrootd makes everything computing in (real) double-float.
;; defaults are provided, so all you need is the list and a starting point.
;; the inputs coefs and x may be given as exact integers, floats, rationals.
;; example  (polyrootd '(1 0 -2) 1); returns 1.4142135623730951d0, i.e. sqrt(2)

(defun polyrootd (coefs x &optional (threshold 1.0d-14) (iters 20))
  (setf coefs (mapcar #'(lambda(z)(coerce z 'double-float))coefs))
    (let ((dp (deriv coefs)))
      (oneroot  #'(lambda(x)(polyevald coefs x))
		#'(lambda(x)(polyevald dp x))
		(* 1.0d0 x)
		(* 1.0d0 threshold)
		iters)))

;; this is the same functionality, but using higher (QD) precision.
;; the inputs coefs and x may be given as exact integers, floats, rationals.
;; (oct::polyroot '(1 0 -2) 1); returns sqrt(2), but if you want more precision:
;; (oct::polyroot '(1 0 -2) 1 1.0d-60)  returns
;;0.141421356237309504880168872420969807856967187537694807317667973796Q1

;;this is called oct:polyroot. You may prefer other ways to do it, 
;;e.g. mpfr::polyroot or some interval method, or some method
;; computing derivatives.

(defun polyroot (coefs x &optional (threshold 1.0d-50) (iters 20))
  (setf coefs (mapcar #'into coefs))
  (let ((dp (deriv coefs)))
    (oneroot  #'(lambda(x)(polyeval coefs x))
	      #'(lambda(x)(polyeval dp x))
	      (into x)
	      (into threshold)
	      iters)))

;; try (polyroot '(1 0 -4) 1 1d-60 20)

;; ONEROOT is considerably more versatile: it can find root of other
;; functions, not just polynomials.
#|
(defun tt (x)(- (cos x) x))  ;; easy near 0.73
(defun ttd(x) (- (+ (sin x) 1))  )
(oneroot #'tt #'ttd (into 0) (into 1.0d-64) 20)  ;; uses oct
(oneroot #'tt #'ttd 0.0d0 1.0d-38 20) ;;uses double-float.

|#
	

;;;;;;;;;;;;;;;;;;;FFT!!!;;;;;;;;;;;;;;;;;

(eval-when (compile load eval)
  
; Fourier Transform Spectral Methods
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;Routines translated with permission by Kevin A. Broughan from ;;;;;;;;;;;
;;Numerical Recipies in Fortran Copyright (c) Numerical Recipies 1986, 1989;;;;
;;;;;;;;;;;;;;;Modified by Ken Olum for Common Lisp, April 1996;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; notes 4/27/05, RJF.  
;; more notes, 2/28/06 RJF. Converting h:/lisp/fft.lisp to OCT arithmetic.
;; slightly rearranged for OCT, 10/29/07  
  
;; see comments at lisp/fft.lisp.  We use four1 only.  The input data
;; will be overwritten by the result.  The inverse fft is indicated by
;; :isign -1, though it must be multiplied by 1/nn to work.
;; functions:
;;	four1: fourier transform (FFT) in one dimension
  
(defparameter *zz* (into 0))
(defparameter *one* (into 1))

(defun v2dfa(a &optional (m (length a))	)
  ;;coerce a vector of oct numbers (or lisp numbers) of length m to a
  ;; oct array of length 2m, since here complex numbers are stored in 2
  ;; adjacent oct locations. Caution: same objects are being used. Do not
  ;; mutate them.
  (let* ((k (length a))
	 (ans (make-array (cl::* 2 m) :allocation :lispstatic-reclaimable ))
	 (zz (copy_mac *zz*));zero
	 (h nil))
    (declare (fixnum k m)(optimize (speed 3)(safety 1)))
    (dotimes (i k)	;just point to the actual numbers, or convert if needed
      (declare (fixnum i))
      (setf (aref ans (cl:* 2 i))
	(if (oct-p (setf h (aref a i))) h (oct:into h))) ;; here we convert.
      ;;(setf (aref ans (cl:1+ (cl:* 2 i))) (copy_mac zz))
      (setf (aref ans (cl:1+ (cl:* 2 i))) zz )); just use zero
    (loop for i fixnum from (* 2 k) to (1-(* 2 m)) do 
	  (setf  (aref ans i) zz)) 
    ans))

;; (v2dfa #(30 40 50) 4)
;;  --> #(0.3Q2 0.Q0 0.4Q2 0.Q0 0.5Q2 0.Q0 0.Q0 0.Q0)


(defmethod ga::outof ((x oct))(oct2lisp x))

(defmethod oct2lisp((x oct))
  (if (excl::nan-p (aref (oct-real x) 0)) (aref (oct-real x) 0)
					; if qd contains a NaN, use it.
      (apply #'cl:+  (map 'list #'rational (oct-real x)))))

(defun dfa2v(a &optional (m (/ (length a)2)))
  ;; Coerce real parts back
  ;; to integers, more or less.  a is an array of even length.  If you
  ;; know that there are trailing zeros, set the actual length with
  ;; the optional second parameter m.
  (let* ((k (/ (length a) 2))
	 (ans (make-array m)))
    (declare (fixnum k m)(optimize (speed 3)(safety 1)))
    (dotimes (i m ans)
      (declare (fixnum i))
      (setf (aref ans i)(round (ga::outof (aref a (cl::* 2 i))) k)))))

;;(dfa2v  (v2dfa #(256 256 256) 4))
;; -->  #(64 64 64 0)  is correct.

;; this works!
(defun polymultfft(r s)
  (declare (optimize (speed 3)))
  ;;compute the size Z of the answer. Z is a power of 2.
  ;; compute complex array r, increased to size S
  ;; compute complex array r, increased to size S
  ;; compute two FFTs
  ;; multiply pointwise
  ;; compute inverse FFT  * 1/n
  ;; convert back to array and return answer
  (let* ((lr (length r))
	 (ls (length s))
	 (lans (cl:+ lr ls -1))
	 (z (ash 1 (ceiling (cl:log lans 2)))) ; round up to power of 2
	 (rfft (four1 (v2dfa r z) z))
	 (sfft (four1 (v2dfa s z) z))
	 (ans (make-array (* 2 z))))
    (declare (fixnum z))
    (dotimes (i (* 2 z))(setf (aref ans i) (copy_mac *zz*)))
    (prodarray rfft sfft z ans)
    (dfa2v(four1 ans z :isign -1) lans)))

(defun polysquare(r) 
  ;;just square the polynomial r. saves one fft. so you can compare times.
  (let* ((lr (length r))
	 (lans (+ lr lr -1))
	 (z (ash 1 (ceiling (log lans 2)))) ; round up to power of 2
	 (rfft (four1 (v2dfa r z) z))
	 (ans (make-array (* 2 z))))
    (prodarray rfft rfft z ans)
    (dfa2v(four1 ans z :isign -1) lans)))

#+ignore
(defun polytime(r s)  ;; this shows that much of the time is in v2dfa etc
  (let* ((lr (length r))
	 (ls (length s))
	 (lans (+ lr ls -1))
	 (z (ash 1 (ceiling (log lans 2)))) ; round up to power of 2
	 (r1 (v2dfa r z))
	 (s1 (v2dfa s z))
	 (sfft 0)
	 (rfft  (four1 r1 z)))
    (start-profiler)
  (time (progn (setf sfft (four1 s1 z))
    (setf sfft (four1 s1 z))
    (setf sfft (four1 s1 z))
    (setf sfft (four1 s1 z))
    (setf sfft (four1 s1 z))))
    (show-flat-profile)
	 ;;(prod (prodarray rfft sfft z ))
	 ;;(ans (time(four1 prod z :isign -1)))
	 )
    ;;(dfa2v ans lans)
    )

#+ignore
(defun polytime2(r)  ;; this shows that much of the time is in v2dfa etc
  (let* ((lr (length r))
	 (lans (+ lr lr -1))
	 (z (ash 1 (ceiling (log lans 2)))) ; round up to power of 2
	 (r1 (v2dfa r z))
	 (start-profiler)
	 (rfft (time (four1 r1 z)))
	 (prod (prodarray rfft rfft z  rfft))
	 (ans (four1 prod z :isign -1)))
    (show-flat-profile)
  (dfa2v ans lans)))

;; Utility to copy an array because this fft will clobber input.
;; We don't use it here, but here's a way to write it.

;;(defun copyarray(a)			; of octs
;;  (map 'array #'(lambda(r)(copy_mac r)) a))

(defun prodarray(r s len ans)
  ;; r and s are the same length arrays
  ;; compute, for i=0, 2, ..., len-2
  ;; ans[i]:=  r[i]*s[i]-r[i+1]*s[i+1] ;; real part
  ;; ans[i+1]:=r[i]*s[i+1]+s[i]*r[i+1] ;; imag part
  (declare (fixnum len)(optimize (speed 3)(safety 1)))
  (let ()
    ;;((ans (make-array (* 2 len))))
  (dotimes (i len ans)
    (let* ((ind (* 2 i))
	   (ind1 (1+ ind))
	   (a (aref r ind))
	   (b (aref r ind1))
	   (c (aref s ind))
	   (d (aref s ind1)))
      (declare ;(type aqd a b c d)
       (fixnum i ind ind1))
       (setf (aref ans ind)(copy(with-temps (- (* a c)(* b d)))))
       (setf (aref ans ind1)(copy(with-temps (+ (* a d)(* b c)))))
   ))))

(defparameter oct2pi (make-oct :real octi::+oct-2pi+))
(defparameter octpi/2 (make-oct :real octi::+oct-pi/2+))

(defparameter onehalf (into 1/2))

;;0.62831853071795864769252867665590057683943387987502116419498891846Q1

;;; this is a fairly generic FFT that works, but not optimized much.
;;; we leave it here just in case you want to copy it for other 
;;; generic arithmetic packages
;;#+ignore  
;; this works, but is not optimized 
(defun copy(x)(if (oct-p x)(copy_mac x)(copy-tree x))) ;; whatever..
#+ignore
(defun four1 (data nn &key (isign 1))
  (declare (type fixnum nn isign))
 (prog ((wr (copy *zz*)) 
	(wi (copy *zz*)) 
	(wpr (copy *zz*))
	(wpi (copy *zz*))
	(wtemp (copy *zz*)) 
        (theta (copy *zz*)) 
	(tempr (copy *zz*)) 
	(tempi (copy *zz*))
	(j 0) (n 0) (m 0) (mmax 0) (istep 0))
   (declare
	;;(type aqd wr wi wpr wpi wtemp theta tempr tempi)
    (fixnum j n m mmax istep))
   
  (setf n (* 2 nn)) 
  (setf j 1) 
  (do ((i 1 (+ i 2)))
      ((> i n) t)
      (declare (fixnum i))
    (when (> j i) 
     (setf tempr (aref data (1- j)))
     (setf tempi (aref data j)) 
     (setf (aref data (1- j)) (aref data (1- i)))
     (setf (aref data j) (aref data i)) 
     (setf (aref data (1- i)) tempr)
     (setf (aref data i) tempi))
    (setf m (floor (/ n 2)))
 label1
    (when (and (>= m 2) (> j m))
     (setf j (- j m)) (setf m (floor (/ m 2)))
     (go label1))
    (setf j (+ j m))) 
  (setf mmax 2) 
 label2 
  (when (> n mmax)
    (setf istep (cl::* 2 mmax))
    (setf theta  (/ oct2pi (* isign mmax)))
    (setf wpr  (* -2 (expt (sin (* 1/2 theta)) 2)))
    (setf wpi (sin theta)) (setf wr (copy *one*)) (setf wi (copy *zz*))
    (do ((m 1 (+ m 2)))
	((cl::> m mmax) t)
      (declare (fixnum  m))
      (do ((i m (+ i istep)))
	  ((> i n) t)
	(declare (type fixnum i))
	(setf j (+ i mmax))
	(setf tempr  (- (* wr (aref data (1- j)))
				    (* wi (aref data j))))
	(setf tempi  (+ (* wr (aref data j))
				    (* wi (aref data (1- j)))))
	(setf (aref data (1- j)) (- (aref data (1- i)) tempr))
	(setf (aref data j) (- (aref data i) tempi))
	(setf (aref data (1- i)) (+ (aref data (1- i)) tempr))
	(setf (aref data i) (+ (aref data i) tempi)))
      (setf wtemp wr)
      (setf wr (+  (* wr wpr) (* (* -1 wi) wpi) wr))
      (setf wi (+  (* wi wpr) (* wtemp wpi) wi)))
    (setf mmax istep)
    (go label2)) 
   (return data)))

;; hacking four1 for speed, keeping space consumption down

(defun four1 (data nn &key (isign 1))
  (declare (type fixnum nn isign) (optimize (speed 3)(safety 1)))
  (prog ((wr (copy_mac *zz*)) 
	 (wi (copy_mac *zz*)) 
	 (wpr (copy_mac *zz*))
	 (wpi (copy_mac *zz*))
	 (wtemp (copy_mac *zz*)) 
	 (theta (copy_mac *zz*)) 
	 (halftheta (copy_mac *zz*)) 
	 (cost (copy_mac *zz*))
	 (tempr (copy_mac *zz*)) 
	 (tempi (copy_mac *zz*))
	 (one (copy_mac *one*))
	 (zero (copy_mac *zz*))
	 (temprx 0) (tempix 0)
	 (j 0) (n 0) (m 0) (mmax 0) (istep 0))
	(declare  (fixnum j n m mmax istep))
	(setf n (cl:* 2 nn)) 
	(setf j 1) 
	(do ((i 1 (cl:+ i 2)))
	    ((cl:> i n))
	  (declare (fixnum i))
	  (when (cl:> j i) 
	    (setf temprx (aref data (cl:1- j)))
	    (setf tempix (aref data j))
	    (setf (aref data (cl:1- j)) (aref data (cl:1- i)))
	    (setf (aref data j) (aref data i)) 
	    (setf (aref data (cl:1- i)) temprx)
	    (setf (aref data i) tempix))
	  (setf m (cl:floor n 2))
	  label1
	  (when (and (cl:>= m 2) (cl:> j m))
	    (setf j (cl:- j m)) (setf m (cl:floor m 2))
	    (go label1))
	  (setf j (cl:+ j m))) 
	(setf mmax 2) 
	label2 
	(when (cl:> n mmax)
	  (setf istep (cl:* 2 mmax))
	  (dsetv theta (into (cl:* isign mmax)))
	  (dsetv theta (/ oct2pi theta))
	  (dsetv halftheta (* 1/2 theta))
	  (octi::sincos-oct-t (oct-real halftheta)(oct-real wpr)(oct-real cost))
	 (dsetv wpi  (* 2(* wpr cost))) ;; 2*sin(t/2)*cos(t/2)
	 (dsetv wpr  (* -2 (* wpr wpr)))
	 (dsetv wr one)
	 (dsetv wi zero)
	  (do ((m 1 (cl:+ m 2)))
	      ((cl:> m mmax) t)
	    (declare (fixnum m))
	    (do ((i m (cl:+ i istep)))
		((cl:> i n) t)
	      (declare (fixnum i))
	      (setf j (cl:+ i mmax))

	      (dsetv tempr  (- (* wr (aref data (cl:1- j)))
			       (* wi (aref data j))))
	      
	      (dsetv tempi (+ (* wr (aref data j))
			      (* wi (aref data (cl:1- j)))))

	      (setf (aref data (cl:1- j)) (- (aref data (cl:1- i)) tempr))
	      ;; (dsetv (aref data (cl:1- j)) (- (aref data (cl:1- i)) tempr))
	      
	      (setf (aref data j) (- (aref data i) tempi))
	      ;;(dsetv (aref data j) (- (aref data i) tempi))
	      
	      (dsetv (aref data (cl:1- i))   
		     (+ (aref data (cl:1- i)) tempr))
	      (dsetv (aref data i) (+  (aref data i) tempi)))
	    (dsetv wtemp wr)
	    (dsetv wr  (+  (* wr wpr) (* (- wi) wpi) wr))
	    (dsetv wi  (+  (* wi wpr) (* wtemp wpi) wi))  )
	  (setf mmax istep)
	  (go label2)) 
    (return data)))
)  

#|  what the answer should be ...
(defun t1()(polymultfft #(1  2 3 4 5 6) #(7 8 9)));; test
(t1)
#(7 22 46 70 94 118 93 54)
(four1 (v2dfa #(1 2 3 4 5 6) 16) 16) ;; except higher precision
#(21.0d0 0.0d0 4.203712543852026d0 17.12548253340269d0
  -9.656854249492387d0 2.999999999999995d0 1.4918055861931159d0
  -4.857754915068683d0 2.999999999999997d0 4.0d0 ...)  ;; works with unoptimized..

(defun randpol(n m) ;; array of ints
  (let ((ans (make-array n))
	(lim (expt 2 m)))
    (dotimes (i n ans)(setf (aref ans i) (random lim)))))

(setf r (randpol 512 332))
(time (progn (polymultfft r r) nil))
(time (progn (polymultfft qr qr) nil))
(setf s (make-array 512 :initial-element 1))
(time (polymultfft s s) ))
;; This is similar in speed to wxmaxima in GCL:
;; s:rat(sum(x^i,i,0, 511),x)$
;; s*s$  ;; time is .6 sec, vs .8 sec for fft.



|#