4,887,235
	473	474
	   ;;                                       <- 32-s ->
	   ;;      ................................|SSSSSSSSSSssssssssssssssssssssss
	   ;;	                   DDDDDDDDDDDDDDDD DDDDDDDDDDdddddd
	   (parallel-with-s-access
	     bb-s-offset
	     (assign byte-r (32- bb-s-bitpos))
	     (assign b-temp bb-s-bitpos)
	     (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))))
	   (incr-wrap-s-offset)
	   ;;                      <----- s-d ----> <- 32-s ->    (32-d)-(32-s)-s-d
	   ;;      ssssssssssssssssSSSSSSSSSSSSSSSS|1111111111......................
	   ;;                      DDDDDDDDDDDDDDDD DDDDDDDDDDdddddd
	   (parallel-with-s-access
	     bb-s-offset
	     (assign byte-r (2- bb-s-bitpos))
	     (assign byte-s (- b-temp bb-d-bitpos 1))
	     (assign bb-s-word (logxor bb-constant memory-data)))
	   (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word))
	   (assign bb-s-bitpos (- b-temp bb-d-bitpos))
;;XXXbrad - missing

;;alu depends only on source bits
(defucode ubitbit-long-row-source
  (parallel
   (assign b-temp bb-d-bitpos)
   (if (zero-fixnum bb-d-bitpos)
       (if (zero-fixnum bb-s-bitpos)
	   (parallel
	    (assign bb-s-offset (1- bb-s-oftset)) ;bb-aligned-row-source will increment first
	    (lisp (trace-path #/a))
	    (jump ubitblt-aligned-row-source))
	  ;;             SSSSSSSSSS3SSSSSSSSSSSSSSsssssss
	  ;;      dddddddddddddddddddddddddddddddd
	 (parallel-with-s-access
	   bb-s-offset
	   (assign byte-r (32- bb-s-bitpos))
	   (parallel
	     (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))
	     (lisp (trace-path #/c))
	     (jump ubitblt-d-aligned-row-source))))
     (if (equal-fixnum b-temp bb-s-bitpos)
	 ;;SSSSSSSSSSSSSSSSSSSSSSSSSsssssss
	 ;;DDDDDDDDDDDDDDDDDDDDDDDDDddddddd
	 (sequential
	   (parallel-with-s-access
	    bb-s-offset
	    (assign a-temp (32- bb-d-bitpos))
	    (assign byte-r a-temp)
	    (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))))
	   (parallel-with-d-access
	    bb-d-offset
	    (assign byte-r bb-d-bitpos)
	    (assign byte-s (1- a-temp))
	    (store-word (dpb bb-s-word byte-s byte-r memory-data)))
	   (incr-d-offset)
	   (parallel
	    (assign bb-width (- bb-width a-temp))
	    (lisp (trace-path #/b))
	    (jump ubitblt-aligned-row-source)))
       (if (lesser-fixnum bb-s-bitpos b-temp)
	   ;;sssssssssSSSSSSSSSSSSSSSS.......
	   ;;         DDDDDDDDDDDDDDDDdddddddddddddddd
	   ;;         <- 32-d.bitpos->
	   (sequential 
	    (parallel-with-s-access
	      bb-s-offset
	      (assign byte-r (32- bb-s-bitpos))
	      (parallel
	       (assign b-temp (32- bb-d-bitpos))
	       (assign a-temp obus))
	      (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))))
	      ;;.......sssssssssSSSSSSSSSSSSSSSS
	    (parallel-with-d-access
	     bb-d-offset
	     (assign byte-r bb-d-bitpos)
	     (assign byte-s (1- b-temp))
	     (store-word (dpb bb-s-word byte-s byte-r memory-data)))
	    (incr-d-offset)
	    ;;rotate s-word further to right by 32-d.bitpos
	    ;;SSSSSSSSSSSSSSSS.......sssssssss
	    (assign byte-r- bb-d-bitpos)	;or left by -(32-d.bitpos)
	    (assign bb-s-word (rotate bb-s-word byte-r))
	    (assign bb-width (- bb-width a-temp))
	    (parallel
	     (assign bb-s-bitpos (+ bb-s-bitpos b-temp))
	     (lisp (trace-path #/d))
	     (jump ubitblt-d-aligned-row-source)))
	 (sequential
;;The high part of the first source word is not as long as the high part of the
;;first destination word. So extract the useful part of the first source word,
;;and deposit into it as much of the secand source word as needed to fill out the rest
;;of the first destination word. Then position the rest of the second source word
;;appropriately for the inner loop.
4,887,235
	475	476

;;                                       <- 32-s ->
;;      ................................|SSSSSSSSSSsssssssssssssssssssssss
;;	                DDDDDDDDDDDDDDDD DDDDDDDDDDdddddddd
(parallel-with-d-access
  bb-s-offset
  (assign byte-r (32- bb-s-bitpos))
  (assign b-temp bb-s-bitpos)
  (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))))
(incr-wrap-s-offset)
;;                      <----- s-d ----> <- 32-s ->   (32-d)-(32-s)-s-d
;;      ssssssssssssssssSSSSSSSSSSSSSSSS|1111111111......................
;;                      DDDDDDDDDDDDDDDD DDDDDDDDDDdddddd
(parallel-with-s-access
  bb-s-offset
  (assign byte-r (32- bb-s-bitpos))
  (assign byte-s (- b-temp bb-d-bitpos 1))
  (assign bb-s-word2 (logxor bb-constant memory-data)))
(assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word))a
(assign bb-s-bitpos (- b-temp bb-d-bitpos))
(assign a-temp (32- bb-d-bitpos))
(assign bb-width (- bb-width a-temp))
(parallel-with-d-access
  bb-d-offset
  (assign byte-r bb-d-bitpos)
  (assign byte-s (1- a-temp))
  (store-word (dpb bb-s-word byte-s byte-r memory-data)))
(incr-d-offset)
(assign byte-r (32- bb-s-bitpos))
(parallel
 (assign bb-s-word (rotate bb-s-word2 byte-r))
 (lisp (trace-path #/e))
 (jump ubitblt-d-aligned-row-source1))))))))

(defucode ubitblt-aligned-row-source
  (if (greater-or-equal-fixnum bb-width (a-constant (* 8. 32.)))
      ;;Fetch a block of words onto the block of amem past top of stack, and move sp there.
      (sequential
       (assign b-temp (+ bb-s-offset (a-constant 8.)))
       (if (greater-or-equal-fixnum b-temp bb-s-row-length)
	   (goto ubitblt-aligned-row-source-slow-loop)
	 (sequential
	  (assign-vma-offset s 1)
	  (parallel
	   (assign a-temp (b-constant 8.))
	   (assign b-temp obus)
	   (start-memory block read))		;start first word
	  (parallel
	   (waiting-for-memory)
	   (start-memory block read)		;waiting for first word
	   (call ubitblt-block-read-push-8))	;start eccond word
	  (parallel
	   (assign-vma-offset d)
	   (call ubitblt-block-write-pop-8))
	  (parallel
	   (assign bb-s-offset (+ bb-s-offset (a-constant 8.)))
	   (jump ubitblt-aligned-row-source)))))
    ;;Frob with whats left. Too bad dispatch blocks are expensive.
    (if (greater-or-equal-fixnum bb-width (a-constant (* 4 32.)))
	(sequential
	 (assign b-temp (+ bb-s-offset (a-constant 4)))
	 (it (greater-or-equal-fixnum b-temp bb-s-row-length)
	     (goto ubitblt-aligned-row-source-slow-loop)
	     (sequential
	      (assign-vma-offset s 1)
	      (parallel
	       (assign a-temp (b-constant 4))
	       (assign b-temp obus)
	       (start-memory block read))  ;start first word
	      (parallel
	       (waiting-for-memory)		;waiting for first word
	       (start-memory block read)	;start second word
	       (call ubitblt-block-read-push-4))
	      (parallel
	       (assign-vma-offset d)
	       (call ubitblt-block-write-pop-4))
	      (parallel
	       (assign bb-s-offset (+ bb-s-offset (a-constant 4)))
	       (jump ubitblt-aligned-row-source-slow-loop)))))
      (goto ubitbtt-aligned-row-source-slow-loop))))

(defucode ubitblt-aligned-row-source-slow-loop	;9 cycles per word
(parallel
 (assign bb-width (- bb-width (a-constant 32.)))	;1 cycle
 (trap-if (minus-fixnum obus) ubitblt-aligned-row-source-slow-loop-done))
(incr-wrap-s-offset)				;2
(parallel-with-s-access				;3
 bb-s-offset
 (assign bb-s-word (logxor bb-constant memory-data)))
(assign-vma-offset d)				;1
(store-word bb-s-word)				;1
(parallel					;1
4,887,235
	477	478
	(incr-d-offset)
	(lisp (trace-path #/,))
	(jump ubitblt-aligned-row-source-slow-loop)))

(defucode ubitblt-aligned-row-source-slow-loop-done
  (trap-no-save)
  (if (plus-fixnum bb-width)
      (sequential
       (incr-wrap-s-offset)
       (parallel-with-s-access
	 bb-s-offset
	 (assign bb-s-word (logxor bb-constant memory-data)))
       (parallel-with-d-access
	 bb-d-offset
	 (assign byte-r (a-constant 0))
	 (assign byte-s (1- bb-width))
	 (parallel-with-return
	  (store-word (dpb bb-s-word byte-s byte-r memory-data))
	  (lisp (trace-path #/2)))))
    (parallel-with-return
     (lisp (trace-path #/1)))))

;;Each pass through this loop stores exactly one d word. Each time through,
;;bb-s-word will have the bits to use for the lower part of the d word (already
;;rotated into position), and another s word will bo fetched into bb-s-word.
;;Then s-word2 will get rotated when transferred into s-word in preparation for
;;next loop pass.

(defucode ubitblt-d-aligned-row-source
  (if (greater-or-equal-fixnum bb-width (a-constant (* 8. 32.)))
      ;;Fetch a block of words onto the block of amem past top of stack, and move ep there.
      (sequential
       (assign b-temp (+ bb-s-offset (a-constant 8.)))
       (if (greater-or-equal-fixnum b-temp bb-s-row-length)
	   (goto ubitblt-d-aligned-row-source-slow-loop)
	 (sequential
	  (assign-vma-offset s 1)
	  (parallel
	   (assign a-temp (b-constant 8.))
	   (assign b-temp obus)
	   (start-memory block read))		;start first word
	  (parallel
	   (waiting-for-memory)			;waiting for first word
	   (start-memory block read)		;start second word
	   (call ubitblt-block-read-push-8))
	  (parallel
	   (assign-vma-offset d)
	   (call ubitblt-d-aligned-block-write-pop-8))
	  (parallel
	   (assign bb-s-offset (+ bb-s-offset (a-constant 8.)))
	   (jump ubitblt-d-aligned-row-source)))))
    (if (greater-or-equal-fixnum bb-width (a-constant (* 4. 32.)))
	(sequential
	 (assign b-temp (+ bb-s-offset (a-constant 4)))
	 (if (greater-or-equal-fixnum b-temp bb-s-row-length)
	     (goto ubitblt-d-aligned-row-source-slow-loop)
	   (sequential
	    (assign-vma-offset s 1)
	    (parallel
	     (assign a-temp (b-constant 4.))
	     (assign b-temp obus)
	     (start-memory block read))		;start first word
	    (parallel
	     (waiting-for-memory)			;waiting for first word
	     (start-memory block read)		;start second word
	     (call ubitblt-block-read-push-4))
	    (parallel
	     (assign-vma-offset d)
	     (call ubitblt-d-aligned-block-write-pop-4))
	    (parallel
	     (assign bb-s-offset (+ bb-s-offset (a-constant 4.)))
	     (jump ubitblt-d-aligned-row-source)))))
      (goto ubitblt-d-aligned-row-source-slow-loop))))

(defmacro def-d-aligned-block-write-pop (name n)
  `(defucode ,name
     (assign byte-s (1- bb-s-bitpos))
     (assign byte-r (- (b-constant 32.) bb-s-bitpos))
     ,@(loop for i from n downto 1
	     append ((parallel
		      (assign memory-data (dpb (amem (stack-pointer ,(- (- n i))))
					       byte-s byte-r bb-s-word))
		      (start-memory block write)
		      (lisp (trace-path #/.)))
		     (assign bb-s-word (rotate (amem (stack-pointer ,(- (- n i)))) byte-r))))
     (assign stack-pointer (- stack-pointer b-temp))
     (assign first-part-done (b-constant 0))
     (assign bb-d-offset (+ bb-d-offset a-temp))
     (parallel-with-return
      (assign bb-width (- bb-width (rotate a-temp 5))) ;2~S - bits-per-word
      )))
4,887,235
	479	480
(def-d-aligned-block-write-pop ubitblt-d-aligned-block-write-pop-8 8.)
(def-d-aligned-block-write-pop ubitblt-d-aligned-block-write-pop-4 4.)

(defucode ubitblt-d-aligned-row-source-slow-loop
  (parallel
   (assign bb-width (- bb-width (a-constant 32.)))
   (trap-if (minus-fixnum obus) ubitblt-d-aligned-row-source-done)) ;aborts the assign
  (incr-wrap-s-offset)
  (assign-vma-offset s)
  (parallel
   (assign byte-s (1- bb-s-bitpos))
   (start-memory read))
  (parallel
   (assign byte-r (- (b-constant 32.) bb-s-bitpos))
   (waiting-for-memory))
  (abus-array-data
   (assign bb-s-word2 (logxor bb-constant memory-data)))
  (assign-vma-offset d)
  (store-word (dpb bb-s-word2 byte-s byte-r bb-s-word))
  (incr-d-offset)
  (parallel
   (assign bb-s-word (rotate bb-s-word2 byte-r))
   (lisp (trace-path #/.))
   (jump ubitblt-d-aligned-row-source)))

(defucode ubitblt-d-aligned-row-source-done
  (trap-no-save)
  (if (plus-fixnum bb-width)
      (sequential
       (assign a-temp (32- bb-s-bitpos))	;how many bits are valid in bb-s-word
       (if (lesser-or-equal-fixnum bb-width a-temp)
	   ;;we have enough s bits
	   (parallel-with-d-access
	    bb-d-offset
	    (assign byte-s (1- bb-width))
	    (assign byte-r (a-constant 0))
	    (parallel
	     (lisp (trace-path #/4))
	     (parallel-with-return
	      (store-word (dpb bb-s-word byte-s byte-r memory-data)))))
	 ;;need to get another source word
	 (sequential
	  (incr-wrap-s-offset)
	  (parallel-with-s-access
	   bb-s-offset
	   (assign byte-r (32- bb-s-bitpos))
	   (assign byte-s (1- bb-s-bitpos))
	   (assign bb-s-word2 (logxor bb-constant memory-data)))
	  (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word))
	  (lisp (trace-path #/5))
	  (parallel-with-d-access
	   bb-d-offset
	   (assign byte-s (1- bb-width))
	   (assign byte-r (a-constant 0))
	   (parallel-with-return
	    (store-word (dpb bb-s-word byte-s byte-r memory-data)))))))
    (parallel
     (lisp (trace-path #/3))
     (return))))

;;alu depends only on destination bits
(defucode ubitblt-long-row-destination
  (if (bit first-part-done)
      (goto ubitblt-long-row-destination-pclsr-restart)
    (if (plus-fixnum bb-d-bitpos)
	(sequential				;frob the first partial word
	 (assign a-temp (32- bb-d-bitpos))
	 (assign byte-r bb-d-bitpos)
	 (parallel-with-d-accees
	  bb-d-offset
	  (assign byte-s (1- a-temp))
	  (assign b-temp (dpb bb-constant byte-s byte-r (a-constant 0)))
	  (store-word (logxor b-temp memory-data)))
	 (incr-d-offset)
	 (parallel
	  (assign bb-width (- bb-width a-temp))
	  (lisp (trace-path #/b))
	  (jump ubitblt-long-row-destination-loop)))
      (parallel
       (lisp (trace-path #/a))	;---this debug crap costs a cycle here.
       (jump ubitblt-long-row-destination-loop)))))	;---shouid be goto. not jump.

(defucode ubitblt-long-row-destination-loop
  (if (greater-or-equal-fixnum bb-width (a-constant (* 8. 32.)))
  ;;Fetch a block of words onto the block of amem past top of stack, and move sp there.
  (sequential
   (assign-vma-offset d)
   (parallel
    (assign a-temp (b-constant 8.))
    (assign b-temp obus)
    (start-memory block read))		;start first word
4,887,235
	481	482
    (parallel
     (waiting-for-memory)		;waiting for first word
     (start-memory block read)		;start second word
     (call ubitblt-block-read-push-8))
    (parallel
     (assign-vma-offset d)
     (call-and-return-to ubitblt-block-write-pop-8
			 ubitblt-long-row-destination-loop)))
  ;;Frob with whats left. Too bad dispatch blocks are expensive.
  (if (greater-or-equal-fixnum bb-width (a-constant (* 4 32.)))
      (sequential
       (assign-vma-offset d)
       (parallel
	(assign a-temp (b-constant 4))
	(assign b-temp obus)
	(start-memory block read))	;start first word
       (parallel
	(waiting-for-memory)		;waiting for- first word
	(start-memory block read)	;start second-word
	(call ubitbtt-block-read-push-4))
       (parallel
	(assign-vma-offset d)
	(call-and-return-to ubitblt-block-write-pop-4
			    ubitbtt-long-row-destination-slow-loop)))
    (goto ubitblt-long-row-destination-slow-loop))))

;;Write this when pclsring can happen
(defucode ubitblt-long-row-destination-pclsr-restart
  (lisp (tell-the-simulator-that-it-is-supposed-to-halt-the-machine))
  (halt bitblt-pclsring-now-yet-written))

(defucode ubitblt-long-row-destination-slow-loop
  (parallel
   (assign bb-width (- bb-width (a-constant 32.)))
   (trap-if (minus-fixnum obus) ubitblt-long-row-destination-done)) ;aborts the assign
  (lisp (trace-path #/,))
  (parallel-with-d-access
   bb-d-offset
   (incr-d-offset)
   (parallel
    (store-word (logxor bb-constant memory-data))
    (jump ubitblt-long-row-destination-slow-loop))))

(defucode ubitblt-long-row-destination-done
  (trap-no-save)
  (if (plus-fixnum bb-width)
      (sequential
       (assign byte-r (a-constant 0))
       (parallel-with-d-access
	bb-d-offset
	(assign byte-s (1- bb-width))
	(assign b-temp (dpb bb-constant byte-s byte-r (a-constant 0)))
	(parallel
	 (lisp (trace-path #/2))
	 (parallel-with-return
	  (store-word (logxor b-temp memory-data))))))
    (parallel
     (lisp (trace-path #/1))
     (return))))

(defmacro def-block-read-push (name n)
  `(defucode ,name
     ,@(loop for i from n downto 1
	     collect `(parallel
		       (declare-memory-timing data-cycle)
		       (check-data-type memory-data dtp-fix)
		       (assign (amem (stack-pointer ,i))
			       (logxor bb-constant memory-data))
		       ,(when (> i 2) '(start-memory block read))))
     (assign first-part-done (b-constant 1))
     (parallel-with-return
      (assign stack-pointer (+ stack-pointer b-temp)))))

(def-block-read-push ubitblt-block-read-push-8 8)	;I suppose this when interned...
(def-block-read-push ubitblt-block-read-push-4 4)	;... will subsume this.

(defmacro def-block-write-pop (name n)
  `(defucode ,name
     ,@(loop for i from n downto 1
	     collect `(parallel
		       (assign memory-data (amem (stack-pointer ,(- (- n i)))))
		       (start-memory block write)
		       (lisp (trace-path #/.))))
     (assign stack-pointer (- stack-pointer b-temp))
     (assign first-part-done (b-constant 0))
     (assign bb-d-offset (+ bb-d-offset a-temp))
     (parallel-with-return
      (assign bb-width (- bb-width (rotate a-temp 5))) ;2^5 = bits-per-word
      )))

(def-block-write-pop ubitblt-block-write-pop-8 8)
(def-block-write-pop ubitblt-block-write-pop-4 4)

4,887,235
	483	484

;;alu depends on neither source nor destination bits
(defucode ubitblt-long-row-neither
  (if (plus-fixnum bb-d-bitpos)
      (sequential
       (assign a-temp (32- bb-d-bitpos))
       (parallel-with-d-access
	bb-d-offset
	(assign byte-r bb-d-bitpos)
	(assign byte-s (1- a-temp))
	(store-word (dpb bb-constant byte-s byte-r memory-data)))
       (incr-d-offset)
       (parallel
	(assign bb-width (- bb-width a-temp))
	(lisp (trace-path #/b))
	(jump ubitblt-long-row-neither-loop)))
    (parallel
     (lisp (trace-path #/a))
     (jump ubitblt-long-row-neither-loop))))

(defucode ubitblt-long-row-neither-loop
  (if (greater-or-equal-fixnum bb-width (a-constant (* 8. 32.)))
      (sequential
        (parallel
	  (assign-vma-offset d)
	  (call store-block-bb-constant-8))
	(assign bb-d-offset (+ bb-d-offset (a-constant 8.)))
	(parallel
	  (assign bb-width (- bb-width (a-constant (* 8. 32.))))
	  (jump ubitblt-long-row-neither-loop)))
    (sequential
      (dispatch-after-next (parallel (assign a-temp (ldb bb-width 3 5))
				     (ldb bb-width 3 5))
	 ((7) (parallel (assign-vma-offset d)
			(call-and-return-to store-block-bb-constant-7
					    ubitblt-long-row-neither-finish)))
	 ((6) (parallel (assign-vma-offset d)
			(call-and-return-to store-block-bb-constant-6
					    ubitblt-long-row-neither-finish)))
	 ((5) (parallel (assign-vma-offset d)
			(call-and-return-to store-block-bb-constant-5
					    ubitbtt-long-row-neither-finish)))
	 ((4) (parallel (assign-vma-offset d)
			(call-and-return-to store-block-bb-constant-4
					    ubitblt-long-row-neither-finish)))
	 ((3) (parallel (assign-vma-offset d)
			(call-and-return-to store-b lock-bb-constant-3
					    ubitblt-long-row-neither-finish)))
	 ((2) (parallel (assign-vma-offset d)
			(call-and-return-to store-b lock-bb-constant-2
					    ubitblt-long-row-neither-finish)))
	 ((1) (assign-vma-offset d)
	      (parallel
	        (lisp (trace-path #/.))
		(store-word bb-constant)
		(jump ubitblt-long-row-neither-finish)))
	 (otherwise (goto cant-happen)))
      (if (zero-fixnum a-temp)
	  (goto ubitblt-long-row-neither-finish)
	  (take-dispatch)))))

(defucode ubitblt-long-row-neither-finish
  (assign bb-d-offset (+ bb-d-offset a-temp))
  (assign bb-width (logand bb-width (a-constant #o37)))
  (if (plus-fixnum bb-width)
      (parallel-with-d-accees
       bb-d-offset
       (assign bute-r (a-constant 0))
       (assign byte-s (1- bb-width))
       (parallel
	(lisp (trace-path #/2))
	(store-word (dpb bb-constant byte-s byte-r memory-data))
	(return)))
    (parallel
     (lisp (trace-path #/1))
     (return))))

(defmacro store-block-bb-constant-routines (n)
  `(progn 'compile
	  ,@(loop with s = "STORE-BLOCK-BB-CONSTANT-~d"
		  for i from n downto 1
		  collect `(defucode ,(fintern s i)
			     (parallel
			      (assign memory-data (set-type bb-constant dtp-fix))
			      ,(if (> i 1)
				   '(start-memory block write)
				   '(start-memory write))
			      (lisp (trace-path #/,))
			      ,(if (> i 1)
				   `(jump ,(fintern s (1- i)))
				    (return)))))))
4,887,235
	485	486

(store-block-bb-constant-routines 8.)

;;alu depends both source and destination bits
(defucode ubitblt-long-row-both
  (parallel
   (assign b-temp bb-d-bitpos)
   (if (zero-fixnum bb-d-bitpos)
       (if (zero-fixnum bb-s-bitpos)
	   (parallel
	    (lisp (trace-path #/a))
	    (assign bb-s-offset (1- bb-s-offset))	;bb-aligned-row-both will increment first
	    (jump ubitblt-aligned-row-both))
	 (parallel-with-s-access
	   bb-s-offset
	   ;;    SSSSSSSSSSSSSSSSSSSSSSSSSSSS.ssss
	   ;;dddddddddddddddddddddddddddddddd.
	   (ansign byte-r (32- bb-s-bitpos))
	   (parallel
	    (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))
	    (lisp (trace-path #/c))
	    (jump ubitblt-d-aligned-row-both))))
     (if (equal-fixnum bb-s-bitpos b-temp)
	 (sequential
	   (parallel-with-s-access
	     bb-s-offset
	     ;;SSSSSSSSSSSSSSSSSSSSSSSSSS.ssssss
	     ;;dddddddddddddddddddddddddd.dddddd
	    (parallel
	      (assign byte-r (32- bb-s-bitpos))
	      (assign a-temp obus))
	    (assign byte-s (31- bb-s-bitpos))
	    (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r))))
	   (assign byte-r bb-s-bitpos)
	   (parallel
	     (assign-vma-offset d)
	     ;;sssssssssssssssssssmssssss.ssssss
	     ;;DDDDDDDDDDDDDDDDDDDDDODDDD.dddddd
	     (call bb-byte-alu-operation-dispatch))
	   (incr-d-offset)
	   (parallel
	     (assign bb-width (- bb-width a-temp))
	     (lisp (trace-path #/b))
	     (jump ubitblt-aligned-row-both)))
       (if (lesser-fixnum bb-s-bitpos b-temp)
	   (goto ubitblt-long-row-both-s-longer)
	   (goto ubitblt-long-row-both-s-shorter))))))

(defucode ubitblt-long-row-both-s-longer
  (assign a-temp (32- bb-d-bitpos))
  (parallel-with-s-access
    bb-s-offset
    (assign byte-r (32- bb-s-bitpos))
    (assign byte-s (1- a-temp))
    (assign bb-s-word2 (logxor bb-constant memory-data)))
  ;;ssssSSSSSSSSSSSSSSSSSSSS........
  ;;    DDDDDDDDDDDDDDDDDDDDdddddddddddd
  ;;    <----- a-temp ----->
  (assign bb-s-word (rotate bb-s-word2 byte-r))
  ;;........ssssSSSSSSSSSSSSSSSSSSSS
  (assign byte-r bb-d-bitpos)
  (parallel
    (assign-vma-offset d)
    ;;ssssssssssssssssssssssss.ssssssss
    ;;    DDDDDDDDDDDDDDDDDDDD.dddddddddddd
    (call bb-byte-alu-operation-dispatch))
  (incr-d-offset)
  ;;Remaining are (32-(s.bitpos+(32-d.bitpos))) - d.bitpos-s.bitpos
  ;;    <-- 32-d.bitpos ---> <-s.bitpos->
  ;;SSSSssssssssssssssssssss.ssssssss
  ;;    dddddddddddddddddddd.dddddddddddd
  (assign b-temp bb-s-bitpos)
  (assign byte-r (- bb-d-bitpos b-temp))
  (assign bb-s-word (rotate bb-s-word2 byte-r))
  (assign bb-width (- bb-width a-temp))
  (parallel
    (assign bb-s-bitpos (+ b-temp a-temp))
    (lisp (trace-path #/d))
    (jump ubitblt-d-aligned-row-both)))

(defucode ubitblt-long-row-both-s-shorter
  ;;    sssssssssssessssssssssss.ssssssss
  ;;dddddddddddddddddddddddddddd.dddd
  (parallel-with-s-access
    bb-s-offset
    (assign byte-r (32- bb-s-bitpos))
    (assign byte-s (31- bb-s-bitpos))
    ;;    SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss
    ;;dddddddddddddddddddddddddddd.dddd
    (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r))))
  (incr-wrap-s-offset)
  ;;   <--> s.bitpos-d.bitpos
  ;;...SSSS|ssssssssssssssssssssssss.ssssssss
4,887,235
	487	488
  ;;   dddd dddddddddddddddddddddddd.dddd
  (assign b-temp bb-d-bitpos)
  (parallel-with-s-access
    bb-s-oftset
    (assign byte-s (- bb-s-bitpos b-temp 1))
    (assign byte-r (32- bb-s-bitpos))
    (assign bb-s-word2 (logxor bb-constant memory-data)))
  ;;...SSSS|SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss
  ;;   dddd dddddddddddddddddddddddd.dddd
  (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word))
  (assign byte-r bb-d-bitpos)
  (assign byte-s (31- bb-d-bitpos))
  ;;...ssss|ssssssssssssssssssssssss.ssssssss
  ;;   DDDD DDDDDDDDDDDDDDDDDDDDDDDD.dddd
  (parallel
   (assign-vma-offset d)
   (call bb-byte-alu-operation-dispatch))
  (incr-d-offset)
  ;;...SSSssss|ssssssssssssssssssssssss.ssssssss
  ;;      dddd dddddddddddddddddddddddd.dddd
  (assign a-temp (32- bb-d-bitpos))
  (assign bb-width (- bb-width a-temp))		;Try to find some more cleverness here.
  (assign b-temp bb-d-bitpos)
  (assign byte-r (- b-temp bb-s-bitpos))
  (assign bb-s-bitpos (- bb-s-bitpos b-temp))
  (parallel
    (assign bb-s-word (rotate bb-s-word2 byte-r))
    (lisp (trace-path #/e))
    (jump ubitblt-d-aligned-row-both)))

(defucode ubitblt-aligned-row-both
  (if (greater-or-equal-fixnum bb-width (a-constant (* 8. 32.)))
      ;;Fetch a block of words onto the block of amem past top of stack, and move sp there.
      (sequential
        (assign b-temp (+ bb-s-offset (a-constant 8.)))
	(if (greater-or-egual-fixnum b-temp bb-s-row-length)
	    (goto ubitblt-aligned-row-both-slow-loop)
	  (sequential
	   (assign-vma-offset s 1)
	   (parallel
	    (assign a-temp (b-constant 8.))
	    (assign b-temp obus)
	    (start-memory block read))		;start first word
	   (parallel
	    (waiting-for-memory)		;waiting for first word
	    (start-memory block read)		;start second word
	    (cal ubitblt-block-read-push-8))
	   (assign-vma-offset d)
	   (dispatch-after-this (ldb bb-alu-operation 4 0)
				(parallel
				  (assign a-temp (a-constant 8.))
				  (assign b-temp (a-constant 8.))
				  (start-memory block read)) ;start first word
	    ((1 2)	;; x*y	~x*y
	     (goto ubitblt-block-logand-8))
	    ((4 8.)	;; x*~y	~x*~y
	     (goto ubitblt-block-andc2-8))
	    ((6 9.)	;; x xor y, ~x xor y
	     (goto ubitblt-block-logxor-8))
	    ((7 11.)	;; x+y	~x+y
	     (goto ubitblt-block-logior-8))
	    ((13. 14.)	;; ~(~x*y), ~(x*y)
	     (goto ubitblt-block-lognand-8))
	    (otherwise (goto cant-happen))))))
    ;;Frob with whats left. Too bad dispatch blocks are expensive.
    ;;(if (greater-or-equal-fixnum bb-width (a-constant (* 4 32.))) ...)
    (goto ubitblt-aligned-row-both-slow-loop)))

(defmacro def-block-aluop (name n alu &optional complement)
  `(defucode ,name
     ,@(loop for i from n downto 1
	     append `((parallel
		        (declare-memory-timing active-cycle)	;wait for first word
			(waiting-for-memory)
			(assign b-temp-2 (amem (stack-pointer ,(- (- n i))))))
		      ,@(if (not complement)
			    `((parallel
			       (abus-array-data (assign (amem (stack-pointer ,i))
							(,alu b-temp-a memory-data)))
			       ,(when (> i 1)
				  '(start-memory block read)	;start next word
				  )))
			  `((abus-array-data (assign a-temp-2
						     (,alu b-temp-2 memory-data)))
			    (parallel
			     (assign (amem (stack-pointer ,i))
				     (logxor a-temp-2 (b-constant -1)))
			     ,(when (> I 1)
				'(start-memory block read)
				))))))
     (parallel
4,887,235
	489	490
       (assign stack-pointer (+ stack-pointer b-temp))
       (jump ,(fintern "UBITBLT-BL0CK-ALU-WRITE-~d" n)))))

(def-block-aluop ubitblt-block-logand-8 8 logand)
(def-block-aluop ubitblt-block-logior-8 8 logior)
(def-block-aluop ubitblt-block-logxor-8 8 logxor)
(def-block-aluop ubitblt-block-andc2-8 8 andc2)
(def-block-aluop ubitblt-block-lognand-8 8 logand complement)

(defmacro def-block-alu-write (name n)
  `(defucode ,name
     (assign-vma-offset d)
     ,@(loop for i from n downto 1
	     collect `(parallel
		       (assign memory-data (amem (stack-pointer ,(- (- n i)))))
		       (start-memory block write)
		       (lisp (trace-path #/.))))
     (assign stack-pointer (- stack-pointer (rotate b-temp 1)))
     (assign first-part-done (b-constant 0))
     (assign bb-d-offset (+ bb-d-offset a-temp))
     (assign bb-width (- bb-width (rotate a-temp 5))) 	;2^5 - bits-per-word
     (parallel
       (assign bb-s-offset (+ bb-s-offset a-temp))
       (jump ubitblt-alignod-row-both))))

(def-block-alu-write ubitblt-block-alu-write-8 8)

(defucode ubitblt-aligned-row-both-slow-loop	;11 cycles per word, or 12 for nand
  (parallel					;1 cycle
    (assign bb-width (- bb-width (a-constant 32.)))
    (trap-if (minus-fixnum obus) ubitblt-aligned-row-both-slow-loop-done))
  (incr-wrap-s-offset)				;2 cycles
  (parallel-with-s-access			;3 cycles
    bb-s-offset
    (assign bb-s-word (logxor bb-constant memory-data)))
  (parallel					;1+3 cycles, or 1+4 for nand
    (assign-vma-offset d)
    (call bb-word-alu-operation-dispatch))
  (parallel					;1 cycle
    (incr-d-offset)
    (lisp (trace-path #/,))
    (jump ubitblt-aligned-row-both)))

(defucode ubitblt-aligned-row-both-slow-loop-done
  (if (plus-fixnum bb-width)
      (sequential
        (incr-wrap-s-offset)
	(parallel-with-s-access
	  bb-s-offset
	  (assign byte-r (b-constant 0))
	  (assign byte-s (1- bb-width))
	  (assign bb-s-word (logxor bb-constant memory-data)))
	(parallel
	  (lisp (trace-path #/2))
	  (assign-vma-offset d)
	  (jump bb-byte-alu-operation-dispatch)))	;jcall
    (parallel-with-return
      (lisp (trace-path #/1)))))

;;Each time through the loop, s-word was fetched from memory like
;;          <------s.bitpos------>
;;ssssssssss......................
;;and then rotated so it looks like
;;......................sssssssses
;;<------s.bitpos------>
;;
;;Each time, another s-word2 gets fetched and deposited into s-word like
;;          |<------s.bitpos------>
;;          |......................1111111111
;;2222222222 2222222222222222222222
;;
;;The rotation for the dpb equals the rotation for setup for next loop.

(defucode ubitblt-d-aligned-row-both
  (parallel
   (assign bb-width (- bb-width (a-constant 32.)))
   (trap-if (minus-fixnum obus) ubitblt-d-aligned-row-both-done))	;aborts assign
  (incr-wrap-s-offset)
  (parallel-with-s-access
   bb-s-offset
   (assign byte-r (32- bb-s-bitpos))
   (assign byte-s (1- bb-s-bitpos))
   (assign bb-s-word2 (logxor bb-constant memory-data)))
  (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word))
  (parallel
   (assign-vma-offset d)
   (call bb-word-alu-operation-dispatch))
  (incr-d-offset
   (parallel
    (assign bb-s-word (rotate bb-s-word2 byte-r))
    (lisp (trace-path #/.))
    (jump ubitblt-d-aligned-row-both)))
4,887,235
	491	492
    ;;At entry, we have s-word fetched from memory like
    ;;          <------s.bitpos------>
    ;;ssssssssss......................
    ;;but then rotated so it looks like
    ;;......................ssssssssss
    ;;<------s.bitpos------>
    ;;
    ;;This is to be combined with d-word which looks like
    ;;....................dddddddddddd
    ;;                    <---width-->
(defucode ubitblt-d-aligned-row-both-done
  (trap-no-save)
  (if (plus-fixnum bb-width)
      (sequential
       (assign a-temp (32- bb-s-bitpos))
       (if (lesser-or-equal-fixnum bb-width a-temp)
	   ;;we have enough s bits
	   ;;<----s.bitpos---><----a.temp--->
	   ;;.................sssssssssssssss
	   ;;....................dddddddddddd
	   ;;                    <---width-->
	   (sequential
	    (assign byte-r (b-constant 0))
	    (assign byte-s (1- bb-width))
	    (parallel
	     (assign-vma-offset d)
	     (lisp (trace-path #/4))
	     (jump bb-byte-alu-operation-dispatch))) ;jcall
	 ;;need to get another source word
	 ;;<----s.bitpos---><----a.temp--->
	 ;;.................sssssssssssssss
	 ;;............dddddddddddddddddddd
	 ;;            <-------width------>
	 (sequential
	  (incr-wrap-s-offset)
	  (parallel-with-s-access
	    bb-s-offset
	    (assign byte-r a-temp)
	    (assign byte-s (1- bb-s-bitpos))
	    (assign bb-s-word2 (logxor memory-data bb-constant)))
	  (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word))
	  (assign bute-r (b-constant 0))
	  (assign byte-s (1- bb-width))
	  (parallel
	    (assign-vma-offset d)
	    (lisp (trace-path #/5))
	    (jump bb-byte-alu-operation-dispatch)))))	;jcall
    (parallel-with-return
     (lisp (trace-path #/3)))))

(defucode ubitblt-long-row-source-backwards
  (parallel
   (assign b-temp bb-d-bitpos)
   (if (zero-fixnum bb-d-bitpos)
       (if (zero-fixnum bb-s-bitpos)
	   (parallel
	    (assign bb-s-offset (1+ bb-s-offset))	;the loop will decr first
	    (lisp (trace-path #/a))
	    (jump ubitbtt-aligned-row-source-backwards))
	   (sequential
	    (parallel-with-s-access
	     bb-s-offset
	     (assign byte-r (32- bb-s-bitpos))
	     (parallel
	      (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))
	      (lisp (trace-path #/c))
	      (jump ubitblt-d-aligned-row-source-backwards)))))
       (if (equal-fixnum b-temp bb-s-bitpos)
	   (sequential
	    (parallel-with-s-access
	      bb-s-offset
	      (assign byte-s (1- bb-s-bitpos))
	      (assign byte-r (b-constant 0))
	      (assign bb-s-word (logxor memory-data bb-constant)))
	    (parallel-with-d-access
	      bb-d-offset
	      (decr-d-offset)
	      (assign bb-width (- bb-width bb-s-bitpos))
	      (parallel
	        (store-word (dpb bb-s-word byte-s byte-r memory-data))
		(lisp (trace-path #/b))
		(jump ubitblt-aligned-row-source-backwards))))
	 (if (greater-fixnum bb-s-bitpos b-temp) 	;s > d, enough in the current word
	     (sequential
	       (parallel-with-s-access
		 bb-s-offset
		 (assign bb-width (- bb-width bb-d-bitpos)) ;has to be done somewhere
		 (assign bb-s-word (logxor bb-constant memory-data)))
	       (parallel-with-d-access
		 bb-d-offset
		 (assign byte-s (1- bb-d-bitpos))
